Reinforcement Learning (Part 4): Beating Super Mario Bros with Baidu's PARL Framework
Introduction to the Mario Environment
Installing the Environment
Installation via pip is recommended:
pip install gym-super-mario-bros
Before creating an environment you must import gym_super_mario_bros.
By default, gym_super_mario_bros environments use the full NES action space of 256 discrete actions.
To keep the action space manageable, gym_super_mario_bros.actions provides three action lists for use with nes_py.wrappers.JoypadSpace: RIGHT_ONLY, SIMPLE_MOVEMENT, and COMPLEX_MOVEMENT.
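Before wiring this into training, it can help to see how much each list shrinks the action space. A minimal sketch (the list contents come from gym_super_mario_bros.actions; their exact lengths may vary by library version):

```python
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBros-v0')
print(env.action_space.n)  # full NES action space: 256 discrete actions

# Each list is a set of button combinations; wrapping with JoypadSpace
# restricts the agent to exactly those combinations.
print(len(RIGHT_ONLY), len(SIMPLE_MOVEMENT), len(COMPLEX_MOVEMENT))

env = JoypadSpace(env, SIMPLE_MOVEMENT)
print(env.action_space.n)  # now equal to len(SIMPLE_MOVEMENT)
env.close()
```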
Testing the Environment
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
done = True
for step in range(5000):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()
env.close()
Environment Settings
Environment | Game | ROM |
---|---|---|
SuperMarioBros-v0 | SMB | standard |
SuperMarioBros-v1 | SMB | downsample |
SuperMarioBros-v2 | SMB | pixel |
SuperMarioBros-v3 | SMB | rectangle |
SuperMarioBros2-v0 | SMB2 | standard |
SuperMarioBros2-v1 | SMB2 | downsample |
The info dictionary returned by env.step() contains the following keys:

Key | Type | Description |
---|---|---|
coins | int | The number of collected coins |
flag_get | bool | True if Mario reached a flag or ax |
life | int | The number of lives left, i.e., {3, 2, 1} |
score | int | The cumulative in-game score |
stage | int | The current stage, i.e., {1, ..., 4} |
status | str | Mario's status, i.e., {'small', 'tall', 'fireball'} |
time | int | The time left on the clock |
world | int | The current world, i.e., {1, ..., 8} |
x_pos | int | Mario's x position in the stage (from the left) |
y_pos | int | Mario's y position in the stage (from the bottom) |
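These fields are handy for quick progress checks during debugging. A minimal sketch of stepping once and inspecting a few of them:

```python
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
state = env.reset()
state, reward, done, info = env.step(env.action_space.sample())

print(info['world'], info['stage'])  # current world and stage
print(info['x_pos'], info['life'])   # Mario's horizontal progress and remaining lives
print(info['flag_get'])              # becomes True once Mario reaches the flag/axe
env.close()
```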
Setting Up the Environment Online on Baidu AI Studio
!pip uninstall -y parl # note: uninstall the pre-installed parl version
!pip uninstall -y pandas scikit-learn # tip: uninstalling these two packages before importing parl avoids warnings on AI Studio; leaving them installed does not affect parl
!pip install paddlepaddle-gpu==1.6.3.post97 -i https://mirror.baidu.com/pypi/simple
!pip install parl==1.3.1
!pip install gym
!pip install box2d-py
!pip install gym-super-mario-bros
With the environment configured, paste in the code below and you are ready to train.
Full Code
Import the required libraries
# -*- coding:utf-8 -*-
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
import cv2 as cv
import random
import copy
import time
import numpy as np
import parl
from parl import layers
import paddle.fluid as fluid
from parl.utils import logger
import collections
import matplotlib.pyplot as plt
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
Hyperparameter Settings
LEARN_FREQ = 100  # learning frequency: no need to learn at every step; accumulate some new experience first, which is more efficient
MEMORY_SIZE = 2000  # size of the replay memory; larger uses more RAM
MEMORY_WARMUP_SIZE = 1000  # pre-fill the replay memory with some experience before training starts
BATCH_SIZE = 32  # number of samples per learn() call, randomly sampled from the replay memory
LEARNING_RATE = 0.001  # learning rate
GAMMA = 0.95  # reward discount factor, typically between 0.9 and 0.999
max_episode = 2000000
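With the exploration settings used further down (e_greed=0.1, e_greed_decrement=1e-6, and the 0.01 floor applied in Agent.sample), a quick back-of-the-envelope estimate of how long epsilon-greedy exploration keeps decaying:

```python
e_greed = 0.1             # initial exploration probability (same value passed to Agent below)
e_greed_decrement = 1e-6  # decrement applied on every sample() call
e_greed_floor = 0.01      # lower bound enforced in Agent.sample

steps_until_floor = (e_greed - e_greed_floor) / e_greed_decrement
print(int(steps_until_floor))  # 90000 sampling steps until epsilon stops decaying
```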
Model Definition
Since Super Mario levels are essentially static, the simplest possible convolutional neural network is used here.
# Model
class Model(parl.Model):
    def __init__(self, num_actions):
        # define the hyperparameters of the CNN
        # filter size
        self.filter_size = 5
        # number of filters
        self.num_filters = [16, 32, 64]
        # stride size
        self.stride = 2
        # pool size
        self.poolsize = 2
        self.rnn_hidden_size = 128
        # drop out probability
        self.dropout_probability = [0.3, 0.2]
        self.act_dim = num_actions

    def value(self, obs):
        # first convolutional layer
        self.conv1 = fluid.layers.conv2d(obs,
                                         num_filters=self.num_filters[0],
                                         filter_size=self.filter_size,
                                         stride=self.stride,
                                         act='relu')
        self.pool1 = fluid.layers.pool2d(self.conv1,
                                         pool_size=self.poolsize,
                                         pool_type="max",
                                         pool_stride=self.stride)
        # second convolutional layer
        self.conv2 = fluid.layers.conv2d(self.pool1,
                                         num_filters=self.num_filters[1],
                                         filter_size=self.filter_size,
                                         stride=self.stride,
                                         act='relu')
        self.pool2 = fluid.layers.pool2d(self.conv2,
                                         pool_size=self.poolsize,
                                         pool_type="max",
                                         pool_stride=self.stride)
        # third convolutional layer
        self.conv3 = fluid.layers.conv2d(self.pool2,
                                         num_filters=self.num_filters[2],
                                         filter_size=self.filter_size,
                                         stride=self.stride,
                                         act='relu')
        self.pool3 = fluid.layers.pool2d(self.conv3,
                                         pool_size=self.poolsize,
                                         pool_type="max",
                                         pool_stride=self.stride)
        # fully connected head with dropout
        self.fc1 = fluid.layers.fc(self.pool3, size=self.rnn_hidden_size * 3, act="relu")
        self.drop1 = fluid.layers.dropout(self.fc1, dropout_prob=self.dropout_probability[0])
        self.fc2 = fluid.layers.fc(self.drop1, size=self.rnn_hidden_size * 3, act="sigmoid")
        self.drop2 = fluid.layers.dropout(self.fc2, dropout_prob=self.dropout_probability[1])
        self.prediction = fluid.layers.fc(self.drop2, size=self.act_dim)
        return self.prediction
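With the [3, 240, 256] NCHW input used later, you can sanity-check how the feature maps shrink through this stack. A rough sketch, assuming the fluid defaults of zero padding and floor-mode pooling:

```python
def conv_out(size, filter_size=5, stride=2):
    # output length of a conv dimension with zero padding
    return (size - filter_size) // stride + 1

def pool_out(size, pool_size=2, stride=2):
    # output length of a max-pool dimension with zero padding, floor mode
    return (size - pool_size) // stride + 1

h, w = 240, 256  # input frame height and width
for i in range(3):
    h, w = conv_out(h), conv_out(w)
    h, w = pool_out(h), pool_out(w)
    print('after conv/pool block %d: %d x %d' % (i + 1, h, w))
# Under these assumptions the final feature map is about 2 x 3 spatially,
# which the fully connected layers then flatten into Q-value predictions.
```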
The DQN Algorithm
# Algorithm
class DQN(parl.Algorithm):
    def __init__(self, model, act_dim=None, gamma=None, lr=None):
        """
        Args:
            model (parl.Model): forward network that defines the Q function
            act_dim (int): dimension of the action space, i.e. the number of actions
            gamma (float): reward discount factor
            lr (float): learning rate
        """
        self.model = model
        self.target_model = copy.deepcopy(model)

        assert isinstance(act_dim, int)
        assert isinstance(gamma, float)
        assert isinstance(lr, float)
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr

    def predict(self, obs):
        """
        Use self.model's value network to get [Q(s,a1), Q(s,a2), ...]
        """
        return self.model.value(obs)

    def learn(self, obs, action, reward, next_obs, terminal):
        """
        Update self.model's value network with the DQN algorithm
        """
        # get max Q' from target_model, used to compute target_Q
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True  # stop gradients from flowing through the target
        terminal = layers.cast(terminal, dtype='float32')
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)  # predicted Q values
        # convert action to a one-hot vector, e.g. 3 => [0,0,0,1,0]
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        # element-wise multiplication picks out Q(s,a) for the chosen action
        # e.g. pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]]
        # ==> pred_action_value = [[3.9]]
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)

        # mean squared error between Q(s,a) and target_Q gives the loss
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # Adam optimizer
        optimizer.minimize(cost)
        return cost

    def sync_target(self):
        """
        Sync the parameters of self.model into self.target_model
        """
        self.model.sync_weights_to(self.target_model)
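To see what learn() computes, here is the same target-Q and Q(s,a) selection written out in plain NumPy with made-up numbers (an illustration only, not part of the training code):

```python
import numpy as np

gamma = 0.95
reward = np.array([1.0, 0.0])        # hypothetical batch of two transitions
terminal = np.array([0.0, 1.0])      # the second transition ends the episode
next_q = np.array([[2.0, 3.0, 1.0],  # Q'(s', a) from the target network
                   [0.5, 0.2, 0.1]])

best_v = next_q.max(axis=1)                          # max_a' Q'(s', a')
target = reward + (1.0 - terminal) * gamma * best_v  # Bellman target
print(target)                                        # [3.85, 0.0]

pred_q = np.array([[1.0, 2.5, 0.3],  # Q(s, a) from the online network
                   [0.7, 0.1, 0.9]])
action = np.array([1, 2])            # actions actually taken
onehot = np.eye(3)[action]
pred_action_value = (onehot * pred_q).sum(axis=1)    # pick Q(s, a) of the taken action
print(pred_action_value)                             # [2.5, 0.9]

loss = ((pred_action_value - target) ** 2).mean()    # the mean squared error minimized by Adam
```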
Creating the Agent
# Agent
class Agent(parl.Agent):
    def __init__(self,
                 algorithm,
                 obs_dim,
                 act_dim,
                 e_greed=0.1,
                 e_greed_decrement=0):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

        self.global_step = 0
        self.update_target_steps = 200  # copy the model parameters into target_model every 200 training steps

        self.e_greed = e_greed  # probability of choosing a random action, for exploration
        self.e_greed_decrement = e_greed_decrement  # gradually reduce exploration as training converges

    def build_program(self):
        self.rnn_hidden_size = 128
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):  # build the graph for predicting actions, define inputs/outputs
            obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32', lod_level=1)
            self.value = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):  # build the graph for updating the Q network, define inputs/outputs
            obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32', lod_level=1)
            action = layers.data(name='act', shape=[1], dtype='int32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(name='next_obs', shape=self.obs_dim, dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)

    def sample(self, obs):
        sample = np.random.rand()  # random float in [0, 1)
        if sample < self.e_greed:
            act = np.random.randint(self.act_dim)  # explore: every action has a chance of being picked
        else:
            act = self.predict(obs)  # exploit: pick the best action
        self.e_greed = max(
            0.01, self.e_greed - self.e_greed_decrement)  # gradually reduce exploration as training converges
        return act

    def predict(self, obs):  # pick the best action
        obs = np.array(obs)
        obs = obs.transpose(2, 0, 1)  # HWC -> CHW (NCHW after expand_dims)
        obs = np.expand_dims(obs, axis=0)
        pred_Q = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.value],
            return_numpy=False)[0]
        pred_Q = np.squeeze(pred_Q, axis=0)
        act = np.argmax(pred_Q)  # index of the largest Q value, i.e. the chosen action
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        # sync the parameters of model and target_model every 200 training steps
        if self.global_step % self.update_target_steps == 0:
            self.alg.sync_target()
        self.global_step += 1

        obs = np.array(obs)
        obs = obs.transpose(2, 0, 1)  # HWC -> CHW
        obs = np.expand_dims(obs, axis=0)
        next_obs = np.array(next_obs)
        next_obs = next_obs.transpose(2, 0, 1)  # HWC -> CHW
        next_obs = np.expand_dims(next_obs, axis=0)
        act = np.expand_dims(act, -1)
        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int32'),
            'reward': reward,
            'next_obs': next_obs.astype('float32'),
            'terminal': terminal
        }
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]  # one training step
        return cost
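The observation handling in predict() and learn() is just a layout change from the environment's HWC frames to the NCHW layout that the conv layers expect. A tiny illustration:

```python
import numpy as np

obs = np.zeros((240, 256, 3), dtype='uint8')  # HWC frame as returned by the environment
x = obs.transpose(2, 0, 1)                    # -> CHW
x = np.expand_dims(x, axis=0)                 # -> NCHW, a batch of one
print(x.shape)                                # (1, 3, 240, 256)
```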
Experience Replay
class ReplayMemory(object):
    def __init__(self, max_size):
        self.buffer = collections.deque(maxlen=max_size)

    # add one experience to the replay buffer
    def append(self, exp):
        self.buffer.append(exp)

    # sample N experiences from the replay buffer
    def sample(self, batch_size):
        mini_batch = random.sample(self.buffer, batch_size)
        obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], []

        for experience in mini_batch:
            s, a, r, s_p, done = experience
            obs_batch.append(s)
            action_batch.append(a)
            reward_batch.append(r)
            next_obs_batch.append(s_p)
            done_batch.append(done)

        return np.array(obs_batch[0]).astype('float32'), \
            np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'), \
            np.array(next_obs_batch[0]).astype('float32'), np.array(done_batch).astype('float32')

    def __len__(self):
        return len(self.buffer)
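A quick usage sketch with dummy frames (hypothetical data, only to show the append/sample interface; note that this implementation returns a single observation rather than a full batch of frames):

```python
import numpy as np

rpm = ReplayMemory(max_size=100)
for _ in range(40):
    obs = np.zeros((240, 256, 3), dtype='uint8')       # dummy HWC frame
    next_obs = np.zeros((240, 256, 3), dtype='uint8')
    rpm.append((obs, 1, 0.5, next_obs, False))          # (s, a, r, s', done)

print(len(rpm))  # 40
obs_b, act_b, rew_b, next_obs_b, done_b = rpm.sample(32)
print(obs_b.shape, act_b.shape)  # (240, 256, 3) and (32,): one frame, a batch of actions
```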
Training and Evaluation Functions
# run one training episode
def run_episode(env, agent, rpm, render=False):
    total_reward = 0
    obs = env.reset()
    step = 0
    pass_flag = 1
    while True:
        step += 1
        action = agent.sample(obs)  # sample an action; every action has a chance of being tried
        next_obs, reward, done, info = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))

        if render:
            env.render()

        # train the model
        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs,
                                     batch_done)  # s, a, r, s', done

        total_reward += reward
        obs = next_obs
        if done:
            break
        if info['world'] > pass_flag:
            pass_flag = info['world']
            print("\nPASS!!!")
    return total_reward


# evaluate the agent: run 5 episodes and average the total reward
def evaluate(env, agent, render=False):
    eval_reward = []
    pass_flag = 1
    for i in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)  # only pick the best action
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
            if info['world'] > pass_flag:
                pass_flag = info['world']
                print("\nPASS!!!")
        print(info)
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)
Create the game environment; Super Mario Bros 2 is selected here.
env = gym_super_mario_bros.make('SuperMarioBros2-v1')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

action_dim = env.action_space.n
obs_shape = [3, 240, 256]

rpm = ReplayMemory(MEMORY_SIZE)

# build the agent with the parl framework
model = Model(num_actions=action_dim)
algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(
    algorithm,
    obs_dim=obs_shape,
    act_dim=action_dim,
    e_greed=0.1,  # probability of choosing a random action, for exploration
    e_greed_decrement=1e-6)  # gradually reduce exploration as training converges

# load a previously saved model
# save_path = './DRQN_DOOM.ckpt'
# agent.restore(save_path)

# pre-fill the replay memory so the first training batches have enough sample variety
print("filling memory...")
while len(rpm) < MEMORY_WARMUP_SIZE:
    run_episode(env, agent, rpm)
Start Training
# start training
train_step_list = []
train_reward_list = []
evaluate_step_list = []
evaluate_reward_list = []

import os
npy_path = 'npy/'
os.makedirs(npy_path, exist_ok=True)  # make sure the output directory exists before saving

episode = 0
print("start training...")
while episode < max_episode:  # train for max_episode episodes; the test part does not count towards the episode count
    # train part
    for i in range(0, 50):
        total_reward = run_episode(env, agent, rpm)
        episode += 1
        print("episode:" + str(episode) + " reward:" + str(total_reward))
        train_step_list.append(episode)
        train_reward_list.append(total_reward)

    # test part
    print("start evaluation...")
    eval_reward = evaluate(env, agent, render=False)  # set render=True to watch the evaluation
    logger.info('episode:{} e_greed:{} test_reward:{}'.format(
        episode, agent.e_greed, eval_reward))
    evaluate_step_list.append(episode)
    evaluate_reward_list.append(eval_reward)

    np.save(npy_path + 'train_step_list', train_step_list)
    np.save(npy_path + 'train_reward_list', train_reward_list)
    np.save(npy_path + 'evaluate_step_list', evaluate_step_list)
    np.save(npy_path + 'evaluate_reward_list', evaluate_reward_list)

    save_path = './DQN_MARIO.ckpt'
    agent.save(save_path)

# training finished, save the model
save_path = './DQN_MARIO.ckpt'
agent.save(save_path)
env.close()
After training, visualize the training process.
train_step_list = np.load(npy_path + 'train_step_list.npy')
train_reward_list = np.load(npy_path + 'train_reward_list.npy')
plt.figure()
plt.title('train reward')
plt.xlabel('step')
plt.ylabel('reward')
plt.plot(train_step_list, train_reward_list)
plt.grid()
plt.show()
evaluate_step_list = np.load(npy_path + 'evaluate_step_list.npy')
evaluate_reward_list = np.load(npy_path + 'evaluate_reward_list.npy')
plt.figure()
plt.title('evaluate reward')
plt.xlabel('step')
plt.ylabel('reward')
plt.plot(evaluate_step_list, evaluate_reward_list)
plt.grid()
plt.show()
Closing Remarks
Now get the code running and see when your model can clear the game!!! (🐘 trumpeting)