[Reinforcement Learning] TensorFlow 2.x PPO-Clip with GAE: Training an Agent to Play Atari


Algorithm Flow

(Figure: PPO-Clip with GAE training flow; original image not reproduced)
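For reference, the update implemented in the code below optimizes the standard clipped PPO surrogate with an entropy bonus (coefficient T) and a critic mean-squared-error term; the targets are λ-returns bootstrapped with the critic. A sketch of the formulas as the code uses them (notation assumed, since the original flow chart is not reproduced here):

\rho_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}

L(\theta) = \mathbb{E}_t\!\left[ -\min\!\big( \rho_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(\rho_t(\theta),\, 1-\epsilon,\, 1+\epsilon)\,\hat{A}_t \big) - T\,\mathcal{H}\big[\pi_\theta(\cdot \mid s_t)\big] + \big(R_t^{\lambda} - V_\theta(s_t)\big)^2 \right]

R_t^{\lambda} = r_t + \gamma\big(\lambda\,R_{t+1}^{\lambda} + (1-\lambda)\,V(s_{t+1})\big), \qquad \hat{A}_t = R_t^{\lambda} - V(s_t)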

Code

import tensorflow as tf
import numpy as np

import gym
import copy

# Let TensorFlow allocate GPU memory on demand instead of grabbing it all
# up front; guard the call so CPU-only machines do not crash on an empty list.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)


def build_model(action_dim, l2=1e-4):
    # Shared convolutional trunk with two heads: an actor (softmax over the
    # action space) and a critic (scalar state value). The input is a raw
    # 210x160x3 Atari RGB frame.
    feature_input = tf.keras.Input(shape=(210, 160, 3))
    x = tf.keras.layers.Conv2D(64, 7, 3, padding='same', use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l2))(feature_input)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)

    x = tf.keras.layers.Conv2D(128, 7, 3, padding='same', use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)

    x = tf.keras.layers.Conv2D(256, 7, 3, padding='same', use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)

    x = tf.keras.layers.Conv2D(512, 5, 2, padding='same', use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)

    x = tf.keras.layers.Conv2D(1024, 5, 2, padding='same', use_bias=False,
                               kernel_regularizer=tf.keras.regularizers.l2(l2))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)

    feature_output = tf.keras.layers.GlobalAvgPool2D()(x)

    x_a = tf.keras.layers.Dense(units=512, activation='relu',
                                kernel_regularizer=tf.keras.regularizers.l2(l2))(feature_output)
    x_a = tf.keras.layers.Dense(units=512, activation='relu',
                                kernel_regularizer=tf.keras.regularizers.l2(l2))(x_a)
    actor_output = tf.keras.layers.Dense(units=action_dim, activation='softmax',
                                         kernel_regularizer=tf.keras.regularizers.l2(l2))(x_a)

    x_c = tf.keras.layers.Dense(units=512, activation='relu',
                                kernel_regularizer=tf.keras.regularizers.l2(l2))(feature_output)
    x_c = tf.keras.layers.Dense(units=512, activation='relu',
                                kernel_regularizer=tf.keras.regularizers.l2(l2))(x_c)
    critic_output = tf.keras.layers.Dense(units=1, activation='linear',
                                          kernel_regularizer=tf.keras.regularizers.l2(l2))(x_c)

    AC_model = tf.keras.Model(inputs=feature_input, outputs=[actor_output, critic_output])
    AC_model.summary()
    return AC_model
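
# Illustrative sanity check (hypothetical, not part of the training flow):
# the two heads should produce (batch, action_dim) action probabilities that
# sum to 1 and a (batch, 1) value estimate.
#
#   m = build_model(action_dim=6)
#   p, v = m(np.zeros((1, 210, 160, 3), dtype=np.float32))
#   # p.shape == (1, 6), v.shape == (1, 1), p sums to ~1 along the last axis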

def choice_action(model, state):
    # Sample an action from the current policy. The full probability vector is
    # also returned so it can serve as the "old" policy in the PPO update.
    policy, value = model(np.array([state], dtype=np.float32))
    policy = tf.stop_gradient(policy).numpy()
    return np.random.choice(
        len(policy[0]),
        p=policy[0]
    ), policy[0]

def get_advantage(model, state, reward, batch_size):
    # Advantage estimate: λ-return target minus the critic's value prediction.
    policy, value = model.predict(state, batch_size=batch_size, verbose=0)
    return reward - value

def get_value(model, state, batch_size):
    policy, value = model.predict(state, batch_size=batch_size, verbose=0)
    return value

def PPO_train(model, optimizer, batch_state, batch_action, batch_policy, advantage, batch_reward, K_epochs, batch_size,
              epsilon=0.2, T=1e-2):
    losses = 0.0
    ent = 0.0

    advantage = np.reshape(advantage, newshape=(-1))
    data_length = len(batch_state)
    mini_epo = int(np.ceil(data_length / batch_size))
    for _ in range(K_epochs):
        for i in range(mini_epo):
            start = i * batch_size
            end = min((i + 1) * batch_size, data_length)
            b_s = batch_state[start:end]
            b_p = batch_policy[start:end]
            adv = advantage[start:end]
            b_r = batch_reward[start:end]
            # Pair each mini-batch row with the action taken in it, so that
            # gather_nd can pick out the per-sample action probability.
            b_a = tf.stack([tf.range(end - start, dtype=tf.int32),
                            tf.cast(batch_action[start:end], tf.int32)], axis=1)
            with tf.GradientTape() as tape:
                new_policy, value_predict = model(b_s, training=True)
                pi_prob = tf.gather_nd(params=new_policy, indices=b_a)
                oldpi_prob = tf.gather_nd(params=b_p, indices=b_a)
                entropy = - tf.reduce_sum(new_policy * tf.math.log(new_policy + 1e-6), axis=-1)
                ratio = pi_prob / (oldpi_prob + 1e-6)
                surr1 = ratio * adv
                surr2 = tf.clip_by_value(ratio, clip_value_min=1.0 - epsilon, clip_value_max=1.0 + epsilon) * adv
                # Clipped surrogate + entropy bonus + critic MSE, averaged
                # over the mini-batch.
                loss = tf.reduce_mean(
                    - tf.minimum(surr1, surr2) - T * entropy
                    + tf.keras.losses.mean_squared_error(b_r, value_predict))
            grad = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grad, model.trainable_variables))
            losses += float(loss.numpy())
            ent += np.mean(entropy.numpy())

    return losses / (K_epochs * mini_epo), ent / (K_epochs * mini_epo)
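
# Hypothetical illustration of the gather_nd indexing used above: for a
# 2-sample policy batch and actions [2, 0], the paired indices [[0, 2], [1, 0]]
# select one probability per row.
#
#   probs = tf.constant([[0.1, 0.2, 0.7],
#                        [0.6, 0.3, 0.1]])
#   idx = tf.constant([[0, 2], [1, 0]])
#   tf.gather_nd(probs, idx)  # -> [0.7, 0.6]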


if __name__ == '__main__':
    episodes = 1000000
    env_name = "ALE/SpaceInvaders-v5"
    # env_name = "ALE/Breakout-v5"
    # env_name = "ALE/Tennis-v5"

    render = False
    full_action_space = False
    training = True

    # Note: this script targets the pre-0.26 gym API (4-tuple step(), env.seed);
    # with gym >= 0.26 / gymnasium, use env.reset(seed=0) and the 5-tuple step().
    if render:
        env = gym.make(env_name, render_mode="human", full_action_space=full_action_space)
    else:
        env = gym.make(env_name, render_mode="rgb_array", full_action_space=full_action_space)
    env.seed(0)

    action_dim = env.action_space.n
    learning_rate = 2.5 * 1e-4
    batch_size = 256

    model = build_model(action_dim=action_dim)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # env_name contains a "/", which would otherwise be treated as a directory
    # in the checkpoint path, so flatten it for the weight-file name.
    weight_path = "./{}.h5".format(env_name.replace("/", "-"))
    try:
        print("load model at {}".format(weight_path))
        model.load_weights(weight_path)
        print("load model success!")

    except (OSError, IOError):
        print("No checkpoint, init new model!")

    print("Action Space dimension: {}".format(action_dim))

    gamma = 0.99

    lam = 0.95
    assert 0.0 <= lam <= 1.0, "lambda must lie in [0, 1]"
    K_epoch = 8
    assert K_epoch > 1, "K_epoch must be greater than 1, otherwise the importance-sampling ratio is meaningless"
    epsilon = 0.2
    assert 0.0 < epsilon < 1.0, "epsilon must lie in (0, 1)"
    temperature = 1e-2

    if training:
        for e in range(episodes):
            state = env.reset()

            S, A, P, R, nS = [], [], [], [], []
            score = 0.0

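            # Roll out one full episode with the current policy, storing
            # states, actions, behaviour probabilities, rewards and next
            # states for the PPO update at episode end.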
            while True:
                action, policy = choice_action(model, state)
                next_state, reward, done, info = env.step(action)
                score += reward

                S.append(state)
                A.append(action)
                P.append(policy)
                R.append(reward)
                nS.append(next_state)
                state = copy.deepcopy(next_state)

                if done:
                    discounted_r = []
                    tmp_r = 0.0

                    # Backward λ-return recursion over the finished episode:
                    # R_t = r_t + γ(λ·R_{t+1} + (1-λ)·V(s_{t+1})), with the
                    # bootstrap value zeroed at the terminal step.
                    v_nS = get_value(model, np.array(nS, dtype=np.float32), batch_size)
                    v_nS[-1] = 0
                    for r, vs in zip(R[::-1], v_nS[::-1]):
                        tmp_r = r + gamma * (lam * tmp_r + (1.0 - lam) * vs[0])
                        discounted_r.append(np.array([tmp_r]))
                    discounted_r.reverse()
                    bs = np.array(S, dtype=np.float32)
                    ba = np.array(A)
                    bp = np.array(P)
                    br = np.array(discounted_r, dtype=np.float32)

                    # Advantages are normalized per episode to stabilize updates.
                    advantage = get_advantage(model, bs, br, batch_size)
                    advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-6)

                    losses, entropy = PPO_train(
                        model=model,
                        optimizer=optimizer,
                        batch_state=bs,
                        batch_action=ba,
                        batch_policy=bp,
                        advantage=advantage,
                        batch_reward=br,
                        K_epochs=K_epoch,
                        batch_size=batch_size,
                        epsilon=epsilon,
                        T=temperature
                    )

                    print("episode: {}/{}, score: {}, losses: {:.4f}, entropy: {:.4f}".format(
                        e + 1, episodes, score, losses, entropy))
                    break

            if (e + 1) % 50 == 0:
                model.save_weights(weight_path)
                print("Model Saved!")

    else:
        env = gym.make(env_name, render_mode="human", full_action_space=full_action_space)
        env.seed(0)
        state = env.reset()
        while True:
            action, _ = choice_action(model, state)
            next_state, reward, done, info = env.step(action)
            state = copy.deepcopy(next_state)
            if done: break
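
The backward recursion in the episode loop computes TD(λ) returns, which the PPO-Clip update then uses as both the critic target and (minus the value baseline) the advantage. As a minimal standalone sketch (toy rewards and values, not from the training run above), the recursion can be checked against the closed form for λ = 0, where each target must collapse to the one-step TD target r_t + γ·V(s_{t+1}):

import numpy as np

gamma, lam = 0.99, 0.0          # lam = 0 -> one-step TD target
R = [1.0, 0.0, 2.0]             # toy per-step rewards
v_next = [0.5, 0.3, 0.0]        # toy V(s_{t+1}); terminal bootstrap is 0

# Backward recursion, exactly as in the episode loop above.
discounted, tmp = [], 0.0
for r, v in zip(R[::-1], v_next[::-1]):
    tmp = r + gamma * (lam * tmp + (1.0 - lam) * v)
    discounted.append(tmp)
discounted.reverse()

# With lam = 0 each target is just r_t + gamma * V(s_{t+1}).
expected = [r + gamma * v for r, v in zip(R, v_next)]
assert np.allclose(discounted, expected)
print(discounted)

At the other extreme, λ = 1 ignores the bootstrap values entirely and the recursion reduces to the plain discounted Monte-Carlo return.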
