Reinforcement Learning (Part 4): How to Beat Super Mario Bros with Baidu's PARL Framework


Introducing the Mario Environment

Installing the Environment

Installation with pip is recommended:

pip install gym-super-mario-bros

You need to import gym_super_mario_bros before trying to create an environment.

By default, gym_super_mario_bros environments use the full NES action space of 256 discrete actions. To avoid this, gym_super_mario_bros.actions provides three much smaller action lists for nes_py.wrappers.JoypadSpace: RIGHT_ONLY (move right only), SIMPLE_MOVEMENT, and COMPLEX_MOVEMENT.
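
As a quick check (a minimal sketch; the exact contents depend on the installed gym-super-mario-bros version), you can print each action list and its size:

from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT

# each entry is a list of NES button combinations, e.g. ['right'], ['right', 'A']
for name, movement in [('RIGHT_ONLY', RIGHT_ONLY),
                       ('SIMPLE_MOVEMENT', SIMPLE_MOVEMENT),
                       ('COMPLEX_MOVEMENT', COMPLEX_MOVEMENT)]:
    print(name, len(movement), movement)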

Testing the Environment

from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

done = True
for step in range(5000):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()

env.close()

Environment Settings

Environment         Game  ROM
SuperMarioBros-v0   SMB   standard
SuperMarioBros-v1   SMB   downsample
SuperMarioBros-v2   SMB   pixel
SuperMarioBros-v3   SMB   rectangle
SuperMarioBros2-v0  SMB2  standard
SuperMarioBros2-v1  SMB2  downsample
The info dictionary returned by env.step() contains the following keys:

Key       Type  Description
coins     int   The number of collected coins
flag_get  bool  True if Mario reached a flag or axe
life      int   The number of lives left, i.e., {3, 2, 1}
score     int   The cumulative in-game score
stage     int   The current stage, i.e., {1, ..., 4}
status    str   Mario's status, i.e., {'small', 'tall', 'fireball'}
time      int   The time left on the clock
world     int   The current world, i.e., {1, ..., 8}
x_pos     int   Mario's x position in the stage (from the left)
y_pos     int   Mario's y position in the stage (from the bottom)
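
For illustration only (a small sketch, not part of the original training code), these info fields can be read during a rollout, e.g. to check whether Mario reached the flag or how far he got:

from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

obs = env.reset()
for _ in range(500):   # take up to 500 random steps
    obs, reward, done, info = env.step(env.action_space.sample())
    if done or info['flag_get']:
        break

print('world:', info['world'], 'stage:', info['stage'])
print('x_pos:', info['x_pos'], 'lives left:', info['life'])
print('reached the flag:', info['flag_get'])
env.close()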

Configuring the Environment on Baidu AI Studio

!pip uninstall -y parl  # uninstall the pre-installed PARL version
!pip uninstall -y pandas scikit-learn  # tip: uninstalling these two libraries in AI Studio before importing parl avoids warnings; keeping them does not affect PARL

!pip install paddlepaddle-gpu==1.6.3.post97 -i https://mirror.baidu.com/pypi/simple
!pip install parl==1.3.1
!pip install gym
!pip install box2d-py
!pip install gym-super-mario-bros
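
After installation, a quick sanity check (a small sketch, not from the original post) confirms that the expected versions are importable:

import paddle
import parl
import gym_super_mario_bros

print(paddle.__version__)   # expected: 1.6.3
print(parl.__version__)     # expected: 1.3.1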

With the environment configured, just paste in the code below and you can start training.

Full Code

Importing the Required Libraries

# -*- coding:utf-8 -*-
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
import cv2 as cv
import random
import copy
import time
import numpy as np
import parl
from parl import layers
import paddle.fluid as fluid
from parl.utils import logger
import collections
import os
import matplotlib.pyplot as plt
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

Hyperparameter Settings

LEARN_FREQ = 100  # learning frequency: no need to learn at every step; accumulate some new experience first for efficiency
MEMORY_SIZE = 2000    # size of the replay memory; larger means more RAM usage
MEMORY_WARMUP_SIZE = 1000  # pre-fill the replay memory with some experience before training starts
BATCH_SIZE = 32   # number of samples per learning step, drawn at random from the replay memory
LEARNING_RATE = 0.001  # learning rate
GAMMA = 0.95  # reward discount factor, typically between 0.9 and 0.999
max_episode = 2000000

Model Definition

Since the Super Mario levels are mostly static, a convolutional neural network with the simplest possible structure is used here.
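
Before defining the network, it helps to confirm what a raw observation looks like. A quick check you can run separately (using the imports above; the environment id is only an example):

env_check = gym_super_mario_bros.make('SuperMarioBros-v0')
env_check = JoypadSpace(env_check, SIMPLE_MOVEMENT)
obs = env_check.reset()
print(obs.shape, obs.dtype)   # (240, 256, 3) uint8, i.e. HWC frames that are transposed to CHW later
env_check.close()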

# Model
class Model(parl.Model):

    def __init__(self, num_actions):
        # define the hyperparameters of the CNN
        # filter size
        self.filter_size = 5
        # number of filters
        self.num_filters = [16, 32, 64]
        # stride size
        self.stride = 2
        # pool size
        self.poolsize = 2
        self.rnn_hidden_size = 128
        # drop out probability
        self.dropout_probability = [0.3, 0.2]
        self.act_dim = num_actions

    def value(self, obs):
        # first convolutional layer
        self.conv1 = fluid.layers.conv2d(obs,
                                        num_filters = self.num_filters[0],
                                        filter_size = self.filter_size,
                                        stride = self.stride,
                                        act='relu')
        self.pool1 = fluid.layers.pool2d(self.conv1,
                                         pool_size = self.poolsize,
                                         pool_type = "max",
                                         pool_stride = self.stride)
        # second convolutional layer
        self.conv2 = fluid.layers.conv2d(self.pool1,
                                        num_filters = self.num_filters[1],
                                        filter_size = self.filter_size,
                                        stride = self.stride,
                                        act='relu')
        self.pool2 = fluid.layers.pool2d(self.conv2,
                                         pool_size = self.poolsize,
                                         pool_type = "max",
                                         pool_stride = self.stride)
        # third convolutional layer
        self.conv3 = fluid.layers.conv2d(self.pool2,
                                        num_filters = self.num_filters[2],
                                        filter_size = self.filter_size,
                                        stride = self.stride,
                                        act='relu')
        self.pool3 = fluid.layers.pool2d(self.conv3,
                                         pool_size = self.poolsize,
                                         pool_type = "max",
                                         pool_stride = self.stride)

        self.fc1 = fluid.layers.fc(self.pool3, size = self.rnn_hidden_size*3, act="relu")
        self.drop1 = fluid.layers.dropout(self.fc1, dropout_prob = self.dropout_probability[0])
        self.fc2 = fluid.layers.fc(self.drop1, size = self.rnn_hidden_size*3, act="sigmoid")
        self.drop2= fluid.layers.dropout(self.fc2, dropout_prob = self.dropout_probability[1])
        self.prediction = fluid.layers.fc(self.drop2, size = self.act_dim)
        return self.prediction

The DQN Algorithm

# Algorithm
class DQN(parl.Algorithm):

    def __init__(self, model, act_dim=None, gamma=None,  lr=None):
        """
        Args:
            model (parl.Model): 定义Q函数的前向网络结构
            act_dim (int): action空间的维度,即有几个action
            gamma (float): reward的衰减因子
            lr (float): learning rate 学习率.
        """
        self.model = model
        self.target_model = copy.deepcopy(model)

        assert isinstance(act_dim, int)
        assert isinstance(gamma, float)
        assert isinstance(lr, float)
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr

    def predict(self, obs):
        """
        使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...]
        """
        return self.model.value(obs)

    def learn(self, obs, action, reward, next_obs, terminal):
        """
        使用DQN算法更新self.model的value网络
        """
        # 从target_model中获取 max Q' 的值,用于计算target_Q
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True  # do not propagate gradients through the target
        terminal = layers.cast(terminal, dtype='float32')
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)  # predicted Q values
        # convert action to a one-hot vector, e.g. 3 => [0,0,0,1,0]
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        # element-wise multiplication picks out Q(s,a) of the chosen action
        # e.g. pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]]
        #  ==> pred_action_value = [[3.9]]
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)

        # the loss is the mean squared error between Q(s,a) and target_Q
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # Adam optimizer
        optimizer.minimize(cost)
        return cost

    def sync_target(self):
        """
        把 self.model 的模型参数值同步到 self.target_model
        """
        self.model.sync_weights_to(self.target_model)
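
To make the target computation in learn() concrete, here is a tiny NumPy illustration with made-up numbers; it mirrors target = reward + (1 - terminal) * gamma * max Q'(s', a') and is independent of PaddlePaddle:

import numpy as np

gamma = 0.95
reward = np.array([1.0, 0.0])        # rewards of two sampled transitions
terminal = np.array([0.0, 1.0])      # the second transition ends the episode
next_q = np.array([[0.5, 2.0, 1.0],  # Q'(s', a) from the target network
                   [0.3, 0.1, 0.4]])

best_v = next_q.max(axis=1)                          # max over actions: [2.0, 0.4]
target = reward + (1.0 - terminal) * gamma * best_v  # [1 + 0.95 * 2.0, 0.0] = [2.9, 0.0]
print(target)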

Creating the Agent

# Agent
class Agent(parl.Agent):
    def __init__(self,
                 algorithm,
                 obs_dim,
                 act_dim,
                 e_greed=0.1,
                 e_greed_decrement=0):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

        self.global_step = 0
        self.update_target_steps = 200  # copy the model parameters to target_model every 200 training steps

        self.e_greed = e_greed  # probability of choosing a random action (exploration)
        self.e_greed_decrement = e_greed_decrement  # gradually reduce exploration as training converges

    def build_program(self):
        self.rnn_hidden_size = 128
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):  # build the program for predicting actions; define input/output variables
            obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32',lod_level = 1)
            self.value = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):  # build the program for updating the Q network; define input/output variables
            obs = layers.data(name='obs', shape=self.obs_dim, dtype='float32',lod_level = 1)
            action = layers.data(name='act', shape=[1], dtype='int32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(name='next_obs', shape=self.obs_dim, dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)

    def sample(self, obs):
        sample = np.random.rand()  # random float in [0, 1)
        if sample < self.e_greed:
            act = np.random.randint(self.act_dim)  # explore: every action has a chance of being chosen
        else:
            act = self.predict(obs)  # exploit: choose the best action
        self.e_greed = max(
            0.01, self.e_greed - self.e_greed_decrement)  # gradually reduce exploration as training converges
        return act

    def predict(self, obs):  # choose the best action
        obs = np.array(obs)
        obs = obs.transpose(2, 0, 1)  # HWC -> CHW
        obs = np.expand_dims(obs, axis=0)
        pred_Q = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.value],
            return_numpy=False)[0]
        pred_Q = np.squeeze(pred_Q, axis=0)
        act = np.argmax(pred_Q)  # index of the largest Q value, i.e. the chosen action
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        # sync the parameters of model and target_model every 200 training steps
        if self.global_step % self.update_target_steps == 0:
            self.alg.sync_target()
        self.global_step += 1
        obs = np.array(obs)
        obs = obs.transpose(0, 3, 1, 2)  # NHWC -> NCHW
        next_obs = np.array(next_obs)
        next_obs = next_obs.transpose(0, 3, 1, 2)  # NHWC -> NCHW

        act = np.expand_dims(act, -1)

        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int32'),
            'reward': reward,
            'next_obs': next_obs.astype('float32'),
            'terminal': terminal
        }
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]  # one training step on the network
        return cost

Experience Replay

class ReplayMemory(object):
    def __init__(self, max_size):
        self.buffer = collections.deque(maxlen=max_size)

    # add one piece of experience to the replay buffer
    def append(self, exp):
        self.buffer.append(exp)

    # sample a batch of experiences from the replay buffer
    def sample(self, batch_size):
        mini_batch = random.sample(self.buffer, batch_size)
        obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], []

        for experience in mini_batch:
            s, a, r, s_p, done = experience
            obs_batch.append(s)
            action_batch.append(a)
            reward_batch.append(r)
            next_obs_batch.append(s_p)
            done_batch.append(done)

        return np.array(obs_batch).astype('float32'), \
            np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\
            np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32')

    def __len__(self):
        return len(self.buffer)
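
As a quick usage sketch (with dummy data, not part of the training script), the replay memory stores (s, a, r, s', done) tuples and sample() returns batched NumPy arrays:

rpm_demo = ReplayMemory(max_size=100)
dummy_obs = np.zeros((240, 256, 3), dtype='uint8')  # fake frame with the same shape as the env output
for i in range(40):
    rpm_demo.append((dummy_obs, i % 7, 1.0, dummy_obs, False))

obs_b, act_b, rew_b, next_obs_b, done_b = rpm_demo.sample(32)
print(obs_b.shape, act_b.shape, rew_b.shape)  # (32, 240, 256, 3) (32,) (32,)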

Training and Evaluation Functions

# run one training episode
def run_episode(env, agent, rpm, render=False):
    total_reward = 0
    obs = env.reset()
    step = 0
    pass_flag = 1
    while True:

        step += 1
        action = agent.sample(obs)  # sample an action; every action has a chance of being tried
        next_obs, reward, done, info = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        if render:
            env.render()  # render during training
        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs,
                                     batch_done)  # s,a,r,s',done

        total_reward += reward
        obs = next_obs
        if done:
            break
        if info['world'] > pass_flag:
            pass_flag = info['world'] 
            print("\nPASS!!!")
    return total_reward


# evaluate the agent: run 5 episodes and average the total reward
def evaluate(env, agent, render=False):
    eval_reward = []
    pass_flag = 1
    for i in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)  # greedy: always choose the best action
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
            if info['world'] > pass_flag:
                pass_flag = info['world'] 
                print("\nPASS!!!")
            print(info)   
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)

Create the game environment; Super Mario Bros. 2 is selected here.

env = gym_super_mario_bros.make('SuperMarioBros2-v1')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
action_dim = env.action_space.n
obs_shape = [3,240,256]
rpm = ReplayMemory(MEMORY_SIZE)


# build the agent with the PARL framework
model = Model(num_actions=action_dim)
algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(
    algorithm,
    obs_dim=obs_shape,
    act_dim=action_dim,
    e_greed=0.1,  # probability of choosing a random action (exploration)
    e_greed_decrement=1e-6)  # gradually reduce exploration as training converges

# load a previously saved model
# save_path = './DRQN_DOOM.ckpt'
# agent.restore(save_path)

# pre-fill the replay memory so that early training has enough diverse samples
print("filling memory...")
while len(rpm) < MEMORY_WARMUP_SIZE:
    run_episode(env, agent, rpm)

Start Training


# start training
train_step_list = []
train_reward_list = []
evaluate_step_list = []
evaluate_reward_list = []
npy_path = 'npy/'
os.makedirs(npy_path, exist_ok=True)  # make sure the output directory for the .npy logs exists
episode = 0
print("start training...")
while episode < max_episode:  # train for max_episode episodes; evaluation episodes are not counted
    # train part
    for i in range(0, 50):
        total_reward = run_episode(env, agent, rpm)
        episode += 1
        print("episode:"+str(episode)+"   reward:"+str(total_reward))
        train_step_list.append(episode)
        train_reward_list.append(total_reward)

    # test part
    print("start evaluation...")
    eval_reward = evaluate(env, agent, render=False)  # set render=True to watch the game
    logger.info('episode:{}    e_greed:{}   test_reward:{}'.format(
        episode, agent.e_greed, eval_reward))
    evaluate_step_list.append(episode)
    evaluate_reward_list.append(eval_reward)
    np.save(npy_path + 'train_step_list', train_step_list)
    np.save(npy_path + 'train_reward_list', train_reward_list)
    np.save(npy_path + 'evaluate_step_list', evaluate_step_list)
    np.save(npy_path + 'evaluate_reward_list', evaluate_reward_list)
    save_path = './DQN_MARIO.ckpt'
    agent.save(save_path)
# training finished; save the model
save_path = './DQN_MARIO.ckpt'
agent.save(save_path)
env.close()

After training finishes, visualize the training process.

train_step_list = np.load(npy_path + 'train_step_list.npy')
train_reward_list = np.load(npy_path + 'train_reward_list.npy')

plt.figure()
plt.title('train reward')
plt.xlabel('step')
plt.ylabel('reward')
plt.plot(train_step_list, train_reward_list)
plt.grid()
plt.show()


evaluate_step_list = np.load(npy_path + 'evaluate_step_list.npy')
evaluate_reward_list = np.load(npy_path + 'evaluate_reward_list.npy')

plt.figure()
plt.title('evaluate reward')
plt.xlabel('step')
plt.ylabel('reward')
plt.plot(evaluate_step_list, evaluate_reward_list)
plt.grid()
plt.show()

Final Words

Now get the code running and see when your model can clear the game!

