本文主要用Advantage Actor Critic实现gym中的小飞船登陆的游戏。
游戏名叫“LunarLander-v2” 。
这个环境是在模拟登月小艇降落在月球表面时的情形。这个任务的目标是让登月小艇「安全地」降落在两个黄色旗帜间的平地上。
小船的状态有垂直及水平坐标、速度、角度、加速度等等
小船的操作有四种:0:不采取任何行动。2:向下喷射(主引擎)。1、3:分别向左、右喷射(侧引擎)。
小船的相关需要的代码操作:(来自李宏毅老师的Policy Gradient实现代码中的部分)
安装小游戏需要的组件:
!apt update
!apt install python-opengl xvfb -y
!pip install gym[box2d] pyvirtualdisplay tqdm
# Start a virtual display so env.render() works in a headless (e.g. Colab) session.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

# The original source had these three imports fused onto one invalid line.
import matplotlib.pyplot as plt
from IPython import display
import numpy as np
游戏的初始化:
import gym

env = gym.make('LunarLander-v2')

# Initial observation of the environment.
initial_state = env.reset()
print(initial_state)

# Sample a random action from the discrete action space.
random_action = env.action_space.sample()
print(random_action)

# Take the action and observe the transition.
# observation: the next state
# reward: score for the action; the environment's reward is roughly:
#   crash                          -> -100
#   land between the yellow flags  -> 100~140
#   fire the main (downward) engine-> -0.3 per frame
#   come to a complete rest        -> +100
# done: whether the episode has ended
observation, reward, done, info = env.step(random_action)
一次随机的过程:
# One episode with random actions, rendered frame by frame in the notebook.
env.reset()
img = plt.imshow(env.render(mode='rgb_array'))
done = False
while not done:
    action = env.action_space.sample()
    observation, reward, done, _ = env.step(action)
    # Redraw the current frame in place instead of stacking new figures.
    img.set_data(env.render(mode='rgb_array'))
    display.display(plt.gcf())
    display.clear_output(wait=True)
A2C算法流程:(来自李宏毅老师的PPT)
下面是我的A2C的实现(参考了李宏毅老师的Policy Gradient的实现):
使用tensorflow2实现:
导入包
# The original source had these four imports fused onto one invalid line.
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Sequential, optimizers, Model
# Hyperparameters
GAMMA = 0.95           # discount factor — NOTE(review): defined but unused in the visible training code
LEARNING_RATE = 0.01   # Adam learning rate for both actor and critic
EPLISON = 0.001        # NOTE(review): typo for EPSILON; unused in the visible code, name kept for compatibility
STATE_DIM = 8          # LunarLander-v2 observation dimension
ACTION_DIM = 4         # number of discrete actions
NUM_BATCH = 1000       # number of update batches
EPISODE_PER_BATCH = 5  # episodes collected per policy update
Actor:
class PolicyModel(Model):
    """Actor network: maps a state to action probabilities and samples an action.

    call() expects a batch of exactly one state — shape (1, STATE_DIM) — and
    returns (sampled action index, probability of that action as a tf tensor).
    """

    def __init__(self, action_dim):
        super(PolicyModel, self).__init__()
        self.model = Sequential([
            layers.Dense(16, activation='relu'),
            layers.Dense(16, activation='relu'),
            layers.Dense(action_dim, activation='softmax'),
        ])

    def call(self, inputs, training=None, mask=None):
        prob_weights = self.model(np.array(inputs))
        # Sample from the predicted distribution. Sampling with numpy is fine:
        # the gradient only needs to flow through `prob`, which stays a tensor.
        action = np.random.choice(range(prob_weights.shape[1]),
                                  p=prob_weights.numpy()[0])
        prob = prob_weights[0][action]
        return action, prob
q_model 即Critic
# Critic: predicts a scalar value (here: the cumulative future reward) per state.
q_model = Sequential([
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='linear'),
])

# Use the ACTION_DIM constant instead of repeating the literal 4.
policy_model = PolicyModel(action_dim=ACTION_DIM)
# `learning_rate` replaces the deprecated `lr` keyword of Keras optimizers.
optimizer_policy_model = optimizers.Adam(learning_rate=LEARNING_RATE)
optimizer_q_model = optimizers.Adam(learning_rate=LEARNING_RATE)
env = gym.make('LunarLander-v2')
训练:
# Training: collect EPISODE_PER_BATCH episodes per batch, fit the critic on
# cumulative rewards, then update the actor with an advantage-weighted
# policy-gradient loss accumulated over the batch.
avg_total_rewards, avg_final_rewards = [], []
for batch in range(NUM_BATCH):
    total_rewards, final_rewards = [], []
    # One policy-model tape spans the whole batch of episodes.
    with tf.GradientTape() as tape_policy_model:
        loss_policy_model = 0
        for epoch in range(EPISODE_PER_BATCH):
            log_probs, rewards, states = [], [], []
            state = env.reset()
            while True:
                action, prob = policy_model(np.array([state]))
                next_state, reward, done, _ = env.step(action)
                log_probs.append(tf.math.log(prob))
                rewards.append(reward)
                states.append(state)
                state = next_state
                if done:
                    if reward > -100:  # only print non-crash terminal rewards
                        print(reward)
                    final_rewards.append(reward)
                    total_rewards.append(np.sum(rewards))
                    break

            # Fit the critic on the reward-to-go of each visited state.
            # NOTE(review): this is the UNDISCOUNTED cumulative reward;
            # GAMMA is defined in the hyperparameters but never applied here.
            acc_reward = np.cumsum(rewards[::-1])[::-1]
            with tf.GradientTape() as tape_q_model:
                reward_pred = q_model(np.array(states))
                loss_q_model = tf.losses.MSE(reward_pred, acc_reward)
            grads_q_model = tape_q_model.gradient(loss_q_model, q_model.trainable_variables)
            optimizer_q_model.apply_gradients(zip(grads_q_model, q_model.trainable_variables))

            # Advantage estimate: r_t - V(s_t) + V(s_{t+1}); the final step's
            # V(s_{T+1}) is padded with zero (terminal state has no successor).
            r = tf.reshape(tf.constant(rewards, dtype=tf.float32), shape=(len(rewards), 1)) \
                - q_model(np.array(states)) \
                + tf.pad(q_model(np.array(states[1:])), [[0, 1], [0, 0]])
            # Accumulate sum(advantage * log pi(a|s)) over the episode.
            loss_policy_model += tf.reduce_sum(r * tf.reshape(log_probs, shape=(len(rewards), 1)))
        # Negate (gradient ascent on expected reward) and average over episodes.
        loss_policy_model = -loss_policy_model / EPISODE_PER_BATCH

    # Update the policy model once per batch.
    grads_policy_model = tape_policy_model.gradient(loss_policy_model, policy_model.trainable_variables)
    optimizer_policy_model.apply_gradients(zip(grads_policy_model, policy_model.trainable_variables))

    avg_total_reward = sum(total_rewards) / EPISODE_PER_BATCH
    avg_final_reward = sum(final_rewards) / EPISODE_PER_BATCH
    avg_total_rewards.append(avg_total_reward)
    avg_final_rewards.append(avg_final_reward)

    # Checkpoint every 100 batches.
    if batch % 100 == 0:
        policy_model.save_weights('weights3/' + str(batch) + 'weights_policy_model')
        q_model.save_weights('weights3/' + str(batch) + 'weights_q_model')
        print(loss_policy_model)

np.save('avg_total_rewards', avg_total_rewards)
np.save('avg_final_rewards', avg_final_rewards)
env.close()
训练完成后让我们的 actor 去玩游戏:
# Let the trained actor play one episode and render it.
# NOTE(review): env.close() was called at the end of training — re-create the
# environment (env = gym.make('LunarLander-v2')) before running this cell.
state = env.reset()  # bug fix: the original never reset, reusing a stale `state`
img = plt.imshow(env.render(mode='rgb_array'))
total_reward = 0
actions = []
done = False
while not done:
    action, prob = policy_model(np.array([state]))
    state, reward, done, _ = env.step(action)
    total_reward += reward
    img.set_data(env.render(mode='rgb_array'))
    display.display(plt.gcf())
    display.clear_output(wait=True)
    actions.append(action)