[Reinforcement Learning] Training the LunarLander-v2 rocket lander with A3C in TensorFlow 2.2

I originally wanted to train on SpaceInvaders-v4, but my machine couldn't handle the training, so I switched to LunarLander-v2.

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import threading
import gym

episodes = 2000
gamma = 0.9
learning_rate = 1e-3
num_workers = 3

game = 'LunarLander-v2'
state_shape = (None, 8)
num_actions = 4

# game = 'SpaceInvaders-v4'
# state_shape = (None, 210, 160, 3)
# num_actions = 6

EPISODE = 0
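
# Shared actor-critic network: two dense layers feed an actor head (action logits)
# and a critic head (a scalar state value). The commented-out conv/pool layers were
# intended for the image observations of SpaceInvaders-v4.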

class CNNModel(tf.keras.models.Model):
    def __init__(self, num_actions):
        super(CNNModel, self).__init__()
        # self.conv1 = tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=1, padding='SAME')
        # self.pool1 = tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=2)
        #
        # self.conv2 = tf.keras.layers.Conv2D(filters=32, kernel_size=3, strides=1, padding='SAME')
        # self.pool2 = tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=2)
        #
        # self.conv3 = tf.keras.layers.Conv2D(filters=16, kernel_size=3, strides=1, padding='SAME')
        # self.pool3 = tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=2)
        #
        # self.flatten = tf.keras.layers.Flatten()
        self.linear1 = tf.keras.layers.Dense(units=128, activation='relu')
        self.linear2 = tf.keras.layers.Dense(units=128, activation='relu')
        self.actor_linear = tf.keras.layers.Dense(units=num_actions, activation='linear')
        self.critic_linear = tf.keras.layers.Dense(units=1, activation='linear')


    def call(self, inputs, training=None, mask=None):
        # x = self.conv1(inputs)
        # x = self.pool1(x)
        # x = self.conv2(x)
        # x = self.pool2(x)
        # x = self.conv3(x)
        # x = self.pool3(x)
        # x = self.flatten(x)
        x = self.linear1(inputs)
        x = self.linear2(x)
        return self.actor_linear(x), self.critic_linear(x)
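
# Each worker owns its own gym environment and a local copy of the network. Every
# episode it syncs the local weights from the global model, collects a rollout,
# computes gradients on the local copy, and applies them to the shared global model.
# The workers touch the shared model and the EPISODE counter without a lock, in the
# usual asynchronous (Hogwild-style) manner of A3C.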

def run_worker(global_model, global_optimizer):
    global EPISODE
    global rewards_plt
    global losses_plt
    env = gym.make(game)
    model = CNNModel(num_actions=num_actions)
    model.build(input_shape=state_shape)

    while EPISODE < episodes:
        all_reward = 0
        states, values, act_probs, actions, rewards = [], [], [], [], []

        state = np.array([env.reset()], dtype=np.float32)

        model.set_weights(global_model.get_weights())

        for steps in range(5000):
            states.append(state[0])
            act_prob, value = model(state)
            act_probs.append(act_prob[0])
            values.append(value[0])
            policy = tf.nn.softmax(act_prob)
            # Renormalize in float64 so np.random.choice does not occasionally complain
            # that the float32 softmax probabilities do not sum exactly to 1.
            probs = policy[0].numpy().astype(np.float64)
            action = np.random.choice(num_actions, p=probs / probs.sum())
            actions.append(action)
            state, reward, done, _ = env.step(action)

            rewards.append(reward)

            all_reward += reward
            state = np.array([state], dtype=np.float32)
            if done: break

        rewards_plt.append(all_reward)

        with tf.GradientTape() as tape:
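            # One-step TD targets: R_t = r_t + gamma * V(s_{t+1}); the final transition
            # uses just its reward because the episode has ended. The advantage
            # A_t = R_t - V(s_t) weights the policy-gradient term below.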
            _values = []
            for i in range(len(values) - 1):
                _values.append([rewards[i] + gamma * values[i+1][0]])
            _values.append([rewards[-1]])
            advantages = np.array(_values) - np.array(values)
            advantages = np.reshape(advantages, newshape=(-1))

            actions_onehot = np.eye(num_actions)[actions]
            act_prob, value = model(np.array(states, dtype=np.float32))
            policy = tf.nn.softmax(act_prob)

            # Total loss: policy-gradient term weighted by the advantage, 0.5 * squared
            # TD error for the critic, and an entropy term (p * log p is the negative
            # entropy, so adding it pushes the policy toward higher entropy / exploration).
            losses = advantages * tf.nn.softmax_cross_entropy_with_logits(labels=actions_onehot, logits=act_prob) + \
                     0.5 * tf.reshape((value - _values) ** 2, shape=(-1)) + \
                     0.01 * tf.reduce_mean(policy * tf.math.log(policy + 1e-20), axis=-1)

        # Compute gradients on the local copy and apply them directly to the shared
        # global model's variables.
        grad = tape.gradient(tf.reduce_mean(losses), model.trainable_variables)
        global_optimizer.apply_gradients(zip(grad, global_model.trainable_variables))

        print('episode {}; reward {:.2f}; loss {:.4f}'.format(EPISODE + 1, all_reward, float(tf.reduce_mean(losses))))
        losses_plt.append(float(tf.reduce_mean(losses)))
        EPISODE += 1

if __name__ == '__main__':
    # Let GPU memory grow on demand (the TF2 replacement for the old
    # ConfigProto(allow_growth=True) + InteractiveSession setup).
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    global_model = CNNModel(num_actions=num_actions)
    global_model.build(input_shape=state_shape)
    global_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    rewards_plt = []
    losses_plt = []

    threads = []
    for _ in range(num_workers):
        p = threading.Thread(target=run_worker, args=[global_model, global_optimizer])
        p.start()
        threads.append(p)
    for p in threads: p.join()

    global_model.save_weights('./A3C_LunarLander_2e3_epochs.h5')
    plt.plot(rewards_plt)
    plt.show()
    plt.plot(losses_plt)
    plt.show()

Reward curve over 2000 training episodes:

[figure: episode reward vs. episode]

Loss curve over 2000 training episodes:

[figure: training loss vs. episode]
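
A minimal evaluation sketch for watching the trained agent (assuming the CNNModel class and the weight file saved by the script above): rebuild the network, load the weights, and act greedily on the actor logits for one rendered episode.

import gym
import numpy as np
import tensorflow as tf

env = gym.make('LunarLander-v2')
model = CNNModel(num_actions=4)        # same class as in the training script above
model.build(input_shape=(None, 8))
model.load_weights('./A3C_LunarLander_2e3_epochs.h5')

state = np.array([env.reset()], dtype=np.float32)
total_reward, done = 0.0, False
while not done:
    env.render()
    logits, _ = model(state)                    # actor logits and critic value
    action = int(tf.argmax(logits[0]).numpy())  # greedy action for evaluation
    next_state, reward, done, _ = env.step(action)
    total_reward += reward
    state = np.array([next_state], dtype=np.float32)
print('evaluation reward: {:.2f}'.format(total_reward))
env.close()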
