[Reinforcement Learning] Building Soft Actor-Critic (SAC) in TensorFlow 2.x to Train LunarLanderContinuous-v2


  • Paper link.
  • (Outdated) The policy network's loss function was not implemented as written in the original paper.
  • Update: the policy network's loss function now follows the original paper (a reference sketch of the paper's objectives follows this list).
  • Update: an adaptive temperature parameter alpha was added to control the policy entropy.
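For reference, this is roughly what the paper's objectives look like: a minimal, self-contained sketch with toy tensors, where q_min, target_entropy, and the shapes are illustrative assumptions and not taken from the code below. Note that the sketch sums the log-probability over action dimensions and uses the paper's temperature objective J(alpha) = E[-alpha * (log pi + target_entropy)], whereas the implementation below averages over action dimensions and uses a slightly different temperature loss.

# Sketch of the SAC objectives from the paper (illustrative only, toy tensors).
import tensorflow as tf
from tensorflow_probability.python.distributions import Normal

batch_size, action_dim = 4, 2
mean = tf.zeros((batch_size, action_dim))
log_std = tf.zeros((batch_size, action_dim))
log_alpha = tf.Variable(0.0)
target_entropy = -float(action_dim)        # common heuristic: -|A|
q_min = tf.random.normal((batch_size, 1))  # stands in for min(Q1, Q2) from the critics

dist = Normal(mean, tf.exp(log_std))
u = dist.sample()
a = tf.tanh(u)                             # squash actions into [-1, 1]
# tanh change-of-variables correction, summed over action dimensions
logp = tf.reduce_sum(dist.log_prob(u) - tf.math.log(1.0 - tf.square(a) + 1e-6),
                     axis=-1, keepdims=True)

alpha = tf.exp(log_alpha)
policy_loss = tf.reduce_mean(alpha * logp - q_min)
alpha_loss = -tf.reduce_mean(alpha * tf.stop_gradient(logp + target_entropy))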

requirements.txt:
tensorflow-gpu==2.4.0
gym[all]==0.21.0
tensorflow_probability==0.14.0
keras==2.6.0
matplotlib==3.5.1

Full code:

from tensorflow.keras import layers, models, Input, optimizers, losses
from tensorflow_probability.python.distributions import Normal
from collections import deque

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
import copy
import gym

class SoftActorCritic:
    def __init__(self, state_shape, action_dim):
        self.ema = tf.train.ExponentialMovingAverage(decay=0.995)
        self.replay_buffer = deque(maxlen=10000)
        self.gamma = 0.997

        # learnable log of the entropy temperature alpha (kept in log space so alpha stays positive)
        self.log_alpha = tf.Variable(np.random.normal(), trainable=True, name="EntropyTemperature")
        self.mini_entropy = 0.1  # minimum policy entropy targeted by the temperature loss

        self.policy_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q1_OPT = optimizers.Adam(learning_rate=1e-3)
        self.Q2_OPT = optimizers.RMSprop(learning_rate=1e-3)
        self.value_OPT = optimizers.Adam(learning_rate=1e-3)
        self.alpha_OPT = optimizers.SGD(learning_rate=1e-3)

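        # Policy (actor) network: two hidden layers producing the mean and a clipped log-std
        # of a Gaussian over actions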
        policy_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(policy_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        policy_mean = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std = layers.Dense(units=action_dim, activation='linear')(x)
        log_policy_std_clipped = tf.clip_by_value(log_policy_std, -10, 2)
        self.policy_network = models.Model(inputs=policy_input, outputs=[policy_mean, log_policy_std_clipped])

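        # State-value network V(s) plus an EMA-tracked target copy used for the Q targets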
        value_input = Input(shape=state_shape)
        x = layers.Dense(units=1024, activation='relu')(value_input)
        x = layers.Dense(units=1024, activation='relu')(x)
        value_output = layers.Dense(units=1, activation='linear')(x)
        self.value_network = models.Model(inputs=value_input, outputs=value_output)
        self.target_value_network = models.clone_model(self.value_network)
        self._update_target_value_network()

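        # Twin Q networks Q(s, a); the state and action are concatenated at the input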
        Q_state_input = Input(shape=state_shape)
        Q_action_input = Input(shape=(action_dim,))
        x = layers.concatenate([Q_state_input, Q_action_input])
        x = layers.Dense(units=1024, activation='relu')(x)
        x = layers.Dense(units=1024, activation='relu')(x)
        Q_output = layers.Dense(units=1, activation='linear')(x)
        self.Q_network_1 = models.Model(inputs=[Q_state_input, Q_action_input], outputs=Q_output)
        self.Q_network_2 = models.clone_model(self.Q_network_1)

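    # Soft-update the target value network with an exponential moving average of the online weights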
    def _update_target_value_network(self):
        self.ema.apply(self.value_network.trainable_variables)
        for target_value_network_para, value_network_para in zip(self.target_value_network.trainable_variables, self.value_network.trainable_variables):
            target_value_network_para.assign(self.ema.average(value_network_para))

    def save_memory(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))

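    # Sample an action from the squashed Gaussian policy; tanh keeps each action dimension in [-1, 1]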
    def select_action(self, state):
        state = np.array([state])
        policy_mean, log_policy_std = self.policy_network(state)
        policy_mean = np.array(policy_mean[0])
        log_policy_std = np.array(log_policy_std[0])
        policy_std = np.exp(log_policy_std)
        gaussian_distribution = Normal(policy_mean, policy_std)
        action = np.tanh(gaussian_distribution.sample())
        return action

    def update_weights(self, batch_size):
        batch_size = min(batch_size, len(self.replay_buffer))
        training_data = random.sample(self.replay_buffer, batch_size)
        state, action, reward, next_state, done = [], [], [], [], []
        for data in training_data:
            s, a, r, n_s, d = data
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(n_s)
            done.append(d)
        state = np.array(state, dtype=np.float64)
        action = np.array(action, dtype=np.float64)
        reward = np.reshape(reward, newshape=(-1, 1))
        next_state = np.array(next_state, dtype=np.float64)
        done = np.reshape(done, newshape=(-1, 1))

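        # policy (actor) update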
        with tf.GradientTape() as tape:
            policy_mean, log_policy_std = self.policy_network(state)
            policy_std = tf.exp(log_policy_std)

            gaussian_distribution = Normal(policy_mean, policy_std)
            gaussian_sampling = gaussian_distribution.sample()

            # squash the Gaussian sample into [-1, 1] and apply the tanh change-of-variables
            # correction to the log-probability
            sample_action = tf.tanh(gaussian_sampling)
            logprob = gaussian_distribution.log_prob(gaussian_sampling) - tf.math.log(
                1.0 - tf.pow(sample_action, 2) + 1e-6)

            # note: the paper sums over action dimensions; averaging rescales the entropy term by 1/action_dim
            logprob = tf.reduce_mean(logprob, axis=-1, keepdims=True)
            new_Q_value = tf.math.minimum(self.Q_network_1([state, sample_action]), self.Q_network_2([state, sample_action]))

            policy_loss = tf.reduce_mean(tf.exp(self.log_alpha) * logprob - new_Q_value)

        policy_network_grad = tape.gradient(policy_loss, self.policy_network.trainable_variables)
        self.policy_OPT.apply_gradients(zip(policy_network_grad, self.policy_network.trainable_variables))

        # temperature (alpha) update; this differs slightly from the paper's
        # J(alpha) = E[-alpha * (log pi + target_entropy)] shown in the sketch above
        with tf.GradientTape() as tape:
            alpha_loss = - tf.exp(self.log_alpha) * (tf.reduce_mean(tf.exp(logprob) * logprob) + self.mini_entropy)
        alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
        self.alpha_OPT.apply_gradients(zip(alpha_grad, [self.log_alpha]))

        # value network update: regress V(s) toward E[min(Q1, Q2) - alpha * log pi]
        with tf.GradientTape() as tape:
            value = self.value_network(state)
            value_ = tf.stop_gradient(new_Q_value - tf.exp(self.log_alpha) * logprob)
            value_loss = tf.reduce_mean(losses.mean_squared_error(value_, value))
        value_network_grad = tape.gradient(value_loss, self.value_network.trainable_variables)
        self.value_OPT.apply_gradients(zip(value_network_grad, self.value_network.trainable_variables))

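        # Bellman targets for the twin Q networks, built from the EMA target value network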
        target_value = tf.stop_gradient(self.target_value_network(next_state))
        Q_ = reward + self.gamma * (1 - done) * target_value

        with tf.GradientTape() as tape:
            Q_1 = self.Q_network_1([state, action])
            Q_1_loss = tf.reduce_mean(losses.mean_squared_error(Q_, Q_1))
        Q_network_1_grad = tape.gradient(Q_1_loss, self.Q_network_1.trainable_variables)
        self.Q1_OPT.apply_gradients(zip(Q_network_1_grad, self.Q_network_1.trainable_variables))

        with tf.GradientTape() as tape:
            Q_2 = self.Q_network_2([state, action])
            Q_2_loss = tf.reduce_mean(losses.mean_squared_error(Q_, Q_2))
        Q_network_2_grad = tape.gradient(Q_2_loss, self.Q_network_2.trainable_variables)
        self.Q2_OPT.apply_gradients(zip(Q_network_2_grad, self.Q_network_2.trainable_variables))

        self._update_target_value_network()
        return (
            np.array(Q_1_loss, dtype=np.float64),
            np.array(Q_2_loss, dtype=np.float64),
            np.array(policy_loss, dtype=np.float64),
            np.array(value_loss, dtype=np.float64),
            np.array(alpha_loss, dtype=np.float64),
            np.exp(self.log_alpha)
        )

    def save_weights(self, path):
        self.policy_network.save_weights(path + '-policy_network.h5')
        self.value_network.save_weights(path + '-value_network.h5')
        self.Q_network_1.save_weights(path + '-Q_network_1.h5')
        self.Q_network_2.save_weights(path + '-Q_network_2.h5')

    def load_weights(self, path):
        self.policy_network.load_weights(path + '-policy_network.h5')
        self.value_network.load_weights(path + '-value_network.h5')
        self.Q_network_1.load_weights(path + '-Q_network_1.h5')
        self.Q_network_2.load_weights(path + '-Q_network_2.h5')

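# Training loop: collect one episode with the stochastic policy, then perform a single
# batched gradient update from the replay buffer.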
if __name__ == '__main__':
    RENDER = False
    EPISODES = 2000
    BATCH_SIZE = 256
    env = gym.make('LunarLanderContinuous-v2')
    agent = SoftActorCritic((8,), 2)  # LunarLanderContinuous-v2: 8-dim observation, 2-dim action
    # agent.load_weights('./LunarLanderContinuous-v2')
    loss_list = []
    reward_list = []
    _100_window_reward_list = []
    f = open('log.txt', 'w')
    for e in range(EPISODES):
        state = env.reset()
        rewards = 0
        while True:
            if RENDER:
                env.render()
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            agent.save_memory(state, action, reward, next_state, done)
            state = copy.deepcopy(next_state)

            if done: break

        Q1_loss, Q2_loss, policy_loss, value_loss, alpha_loss, alpha = agent.update_weights(BATCH_SIZE)
        loss_list.append(np.sum([Q1_loss, Q2_loss, policy_loss, value_loss]))
        reward_list.append(rewards)
        _100_window_reward = sum(reward_list[-100:]) / len(reward_list[-100:])
        _100_window_reward_list.append(_100_window_reward)
        log = """
        ==============================================================================
        |>episode: {}/{}
        |>memory length: {}
        |>losses 
        |    >>
        |        Q1_loss: {}, Q2_loss: {}, 
        |        policy_loss: {}, value_loss: {}
        |        alpha: {}, alpha_loss: {}
        |    << 
        |>score: {}, avg score: {}
        ==============================================================================
        """.format(
            e + 1, EPISODES, len(agent.replay_buffer),
            Q1_loss, Q2_loss, policy_loss, value_loss, alpha, alpha_loss,
            rewards, _100_window_reward
        )
        f.write(log)
        print("episode: {}/{}, score: {}, avg_score: {}".format(e+1, EPISODES, rewards, _100_window_reward))
        agent.save_weights('./LunarLanderContinuous-v2')
    f.close()
    plt.plot(reward_list)
    plt.plot(_100_window_reward_list)
    plt.show()
    plt.plot(loss_list)
    plt.show()
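For completeness, here is a hypothetical evaluation sketch (not part of the original script, and assuming the SoftActorCritic class above is importable): it loads the saved weights and runs one greedy episode, using tanh of the policy mean instead of sampling.

# Hypothetical evaluation sketch: load saved weights and run one greedy episode.
import numpy as np
import gym

env = gym.make('LunarLanderContinuous-v2')
agent = SoftActorCritic((8,), 2)                  # assumes the class defined above
agent.load_weights('./LunarLanderContinuous-v2')  # weights saved by the training loop

state = env.reset()
total_reward, done = 0.0, False
while not done:
    mean, _ = agent.policy_network(np.array([state]))  # deterministic action: ignore the std head
    action = np.tanh(np.array(mean[0]))
    state, reward, done, _ = env.step(action)
    total_reward += reward
print('evaluation return:', total_reward)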

Reward curve (training plot: per-episode reward and 100-episode moving average)

