A Great Victory for Exploration: Random Network Distillation (RND)
Foreword
This project presents another form of the exploration-encouraging mechanisms in reinforcement learning, following an earlier project of mine, Curiosity-Driven Reinforcement Learning (the curiosity mechanism is one such form). If you are not yet familiar with curiosity, that project is worth a look first.
How Strong Is RND?
Let's start with a benchmark, on the Atari game Montezuma's Revenge.
As you can see, a whole roster of classic algorithms, including DQN, A3C, C51, and ES, scored a flat zero. The ones that fare somewhat better generally either build on more advanced theory (e.g., UBE, the Uncertainty Bellman Equation and Exploration) or use a distributed training framework (e.g., Ape-X, a distributed prioritized-replay DQN framework).
Our RND, however, leaves the pack far behind: it clears every level and even finds all the hidden rooms (though who knows how long it trained for).
A Brief Introduction to RND
So what does the RND algorithm actually look like? Let's start with the official pseudocode (just to look the part).
In short, it is simply two networks with different random parameters: one that gets updated (called the predictor) and one that never does (called the target).
The theoretical premise the target must satisfy is that different raw inputs (e.g., images) produce different outputs, i.e., the mapping is one-to-one.
The predictor's goal is to keep up with the target. If it can (i.e., the two networks' outputs are very similar), the predictor must already have been trained on the same inputs as the target, meaning this part of the environment has already been explored, so the intrinsic reward shrinks.
As an analogy: if the whole map is a scratch card covered by the target, the predictor gradually scratches the entire map open as environment inputs stream in.
The intrinsic reward here is defined as $\|\hat{f}(x;\theta) - f(x)\|^2$.
By convention, $\hat{f}$ (with the little hat) is the predicted value and $\theta$ are its parameters, corresponding to the predictor; the remaining $f$ is then the target's output.
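To make this concrete, here is a minimal sketch of the predictor/target pair (a toy illustration with arbitrary layer sizes, not the actual model used later):

import paddle
import paddle.nn as nn

# two networks with independent random parameters; only the predictor is trained
target = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 8))
predictor = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 8))
for p in target.parameters():
    p.stop_gradient = True  # the target is never updated

x = paddle.randn([1, 4])  # stand-in for an observation
# intrinsic reward: ||f_hat(x; theta) - f(x)||^2, small for familiar inputs
intrinsic_reward = ((predictor(x) - target(x)) ** 2).sum()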
How to Use RND
Anyone familiar with the curiosity mechanism knows that the curiosity module generating the curiosity reward is a bolt-on component: it can be attached directly to the core reinforcement learning algorithm.
So, when updating the model, remember to collect the next states into a list; at update time, compute the intrinsic reward over them by index and add it to the model's total loss.
But be sure to set stop_gradient=True on the target!
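As a rough, self-contained sketch of that wiring (the tiny linear networks and the placeholder policy_loss are assumptions for illustration, not the actual training script below):

import paddle
import paddle.nn as nn

target = nn.Linear(16, 8)
predictor = nn.Linear(16, 8)
for p in target.parameters():
    p.stop_gradient = True  # the frozen target must not receive gradients

next_states_batch = paddle.randn([32, 16])  # collected next states
policy_loss = paddle.to_tensor(0.0)  # stands in for the RL algorithm's own loss

# the RND prediction error rides along in the total loss
forward_loss = ((predictor(next_states_batch) - target(next_states_batch)) ** 2).mean()
total_loss = policy_loss + forward_loss
total_loss.backward()  # only the predictor's parameters receive gradients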
The True Final Form: RND + PPO Plays Super Mario Bros
Creating the game environment
Create a game environment that can be run by multiple parallel worker processes.
%%writefile MARIO/game_env.py
from __future__ import print_function
import gym_super_mario_bros
from gym.spaces import Box
from gym import Wrapper
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
import cv2
import numpy as np
import subprocess as sp
import multiprocessing as mp
class Monitor:
def __init__(self, width, height, saved_path):
self.command = ["ffmpeg", "-y", "-f", "rawvideo", "-vcodec", "rawvideo", "-s", "{}X{}".format(width, height),
"-pix_fmt", "rgb24", "-r", "60", "-i", "-", "-an", "-vcodec", "mpeg4", saved_path]
try:
            # create the ffmpeg subprocess for video recording
self.pipe = sp.Popen(self.command, stdin=sp.PIPE, stderr=sp.PIPE)
except FileNotFoundError:
pass
    # write one raw RGB frame into the ffmpeg pipe
def record(self, image_array):
        self.pipe.stdin.write(image_array.tobytes())
def process_frame(frame):
if frame is not None:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(frame, (84, 84))[None, :, :] / 255.
return frame
else:
return np.zeros((1, 84, 84))
class CustomReward(Wrapper):
def __init__(self, env=None, monitor=None):
super(CustomReward, self).__init__(env)
self.observation_space = Box(low=0, high=255, shape=(1, 84, 84))
self.curr_score = 0
if monitor:
self.monitor = monitor
else:
self.monitor = None
def step(self, action):
state, reward, done, info = self.env.step(action)
if self.monitor:
self.monitor.record(state)
state = process_frame(state)
reward += (info["score"] - self.curr_score) / 40.
self.curr_score = info["score"]
if done:
if info["flag_get"]:
reward += 50
reward += info["time"]
else:
reward -= 50
return state, reward / 10., done, info
def reset(self):
self.curr_score = 0
return process_frame(self.env.reset())
class CustomSkipFrame(Wrapper):
def __init__(self, env, skip=4):
super(CustomSkipFrame, self).__init__(env)
self.observation_space = Box(low=0, high=255, shape=(skip, 84, 84))
self.skip = skip
self.states = np.zeros((skip, 84, 84), dtype=np.float32)
def step(self, action):
total_reward = 0
last_states = []
for i in range(self.skip):
state, reward, done, info = self.env.step(action)
total_reward += reward
if i >= self.skip / 2:
last_states.append(state)
if done:
self.reset()
return self.states[None, :, :, :].astype(np.float32), total_reward, done, info
max_state = np.max(np.concatenate(last_states, 0), 0)
self.states[:-1] = self.states[1:]
self.states[-1] = max_state
return self.states[None, :, :, :].astype(np.float32), total_reward, done, info
def reset(self):
state = self.env.reset()
self.states = np.concatenate([state for _ in range(self.skip)], 0)
return self.states[None, :, :, :].astype(np.float32)
def create_train_env(world, stage, actions, output_path=None):
env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage))
if output_path:
monitor = Monitor(256, 240, output_path)
else:
monitor = None
env = JoypadSpace(env, actions)
env = CustomReward(env, monitor)
env = CustomSkipFrame(env)
return env
class MultipleEnvironments:
def __init__(self, world, stage, action_type, num_envs, output_path=None):
self.agent_conns, self.env_conns = zip(*[mp.Pipe() for _ in range(num_envs)])
        # choose the action set
if action_type == "right":
actions = RIGHT_ONLY
elif action_type == "simple":
actions = SIMPLE_MOVEMENT
else:
actions = COMPLEX_MOVEMENT
        # create one environment per worker
self.envs = [create_train_env(world, stage, actions, output_path=output_path) for _ in range(num_envs)]
self.num_states = self.envs[0].observation_space.shape[0]
self.num_actions = len(actions)
        # spawn one process per environment
for index in range(num_envs):
process = mp.Process(target=self.run, args=(index,))
process.start()
self.env_conns[index].close()
def run(self, index):
self.agent_conns[index].close()
while True:
request, action = self.env_conns[index].recv()
if request == "step":
self.env_conns[index].send(self.envs[index].step(int(action)))
elif request == "reset":
self.env_conns[index].send(self.envs[index].reset())
else:
raise NotImplementedError
Overwriting game_env.py
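As an optional sanity check (a snippet added for illustration, not part of the original notebook), a single environment can be created and stepped once:

from game_env import create_train_env
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

env = create_train_env(1, 1, SIMPLE_MOVEMENT)
state = env.reset()  # shape: (1, 4, 84, 84)
state, reward, done, info = env.step(env.action_space.sample())
print(state.shape, reward, done)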
The model file
It contains the Mario actor-critic model.
%%writefile MARIO/model.py
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import Conv2D, ReLU, Linear, Layer
import math
def conv_out(In):
return (In-3+2*1)//2+1
# (input - kernel_size + 2*padding) // stride + 1
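# For the 84x84 frames used here (kernel 3, stride 2, padding 1):
# 84 -> 42 -> 21 -> 11 -> 6, so four convs flatten to 32 * 6 * 6 features.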
class MARIO(Layer):
def __init__(self, input_num, actions):
super(MARIO, self).__init__()
self.num_input = input_num
self.channels = 32
self.kernel = 3
self.stride = 2
self.padding = 1
# self.fc = self.channels*math.pow(conv_out(conv_out(conv_out(conv_out(obs_dim[-1])))),2)
self.fc = 32 * 6 * 6
nn.initializer.set_global_initializer(nn.initializer.KaimingUniform(), nn.initializer.Constant(value=0.))
self.conv0 = Conv2D(out_channels=self.channels,
kernel_size=self.kernel,
stride=self.stride,
padding=self.padding,
dilation=[1, 1],
groups=1,
in_channels=input_num)
self.relu0 = ReLU()
self.conv1 = Conv2D(out_channels=self.channels,
kernel_size=self.kernel,
stride=self.stride,
padding=self.padding,
dilation=[1, 1],
groups=1,
in_channels=self.channels)
self.relu1 = ReLU()
self.conv2 = Conv2D(out_channels=self.channels,
kernel_size=self.kernel,
stride=self.stride,
padding=self.padding,
dilation=[1, 1],
groups=1,
in_channels=self.channels)
self.relu2 = ReLU()
self.conv3 = Conv2D(out_channels=self.channels,
kernel_size=self.kernel,
stride=self.stride,
padding=self.padding,
dilation=[1, 1],
groups=1,
in_channels=self.channels)
self.relu3 = ReLU()
self.linear0 = Linear(in_features=int(self.fc), out_features=512)
self.linear1 = Linear(in_features=512, out_features=actions)
self.linear2 = Linear(in_features=512, out_features=1)
def forward(self, x):
x = self.conv0(x)
x = self.relu0(x)
x = self.conv1(x)
x = self.relu1(x)
x = self.conv2(x)
x = self.relu2(x)
x = self.conv3(x)
x = self.relu3(x)
x = paddle.reshape(x, [x.shape[0], -1])
x = self.linear0(x)
logits = self.linear1(x)
value = self.linear2(x)
return logits, value
Overwriting model.py
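A quick shape check (optional; assumes the 4-frame 84x84 observations produced by CustomSkipFrame and the 7 actions of SIMPLE_MOVEMENT):

import paddle
from model import MARIO

net = MARIO(input_num=4, actions=7)
logits, value = net(paddle.randn([2, 4, 84, 84]))
print(logits.shape, value.shape)  # [2, 7] [2, 1]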
The RND model
Used to generate the intrinsic reward.
%%writefile MARIO/rnd_model.py
import paddle.nn.functional as F
import paddle.nn as nn
import paddle
class Flatten(nn.Layer):
def forward(self, input):
return input.reshape([input.shape[0], -1])
class RNDModel(nn.Layer):
def __init__(self, input_size, output_size):
super(RNDModel, self).__init__()
self.input_size = input_size
self.output_size = output_size
feature_output = 7 * 7 * 64
nn.initializer.set_global_initializer(nn.initializer.KaimingNormal(), nn.initializer.Constant(value=0.0))
        # the predictor network (trained)
self.predictor = nn.Sequential(
nn.Conv2D(
in_channels=self.input_size,
out_channels=32,
kernel_size=8,
stride=4),
nn.LeakyReLU(),
nn.Conv2D(
in_channels=32,
out_channels=64,
kernel_size=4,
stride=2),
nn.LeakyReLU(),
nn.Conv2D(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1),
nn.LeakyReLU(),
Flatten(),
nn.Linear(feature_output, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 512)
)
        # the fixed, randomly initialized target network
self.target = nn.Sequential(
nn.Conv2D(
in_channels=self.input_size,
out_channels=32,
kernel_size=8,
stride=4),
nn.LeakyReLU(),
nn.Conv2D(
in_channels=32,
out_channels=64,
kernel_size=4,
stride=2),
nn.LeakyReLU(),
nn.Conv2D(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1),
nn.LeakyReLU(),
Flatten(),
nn.Linear(feature_output, 512)
)
        for param in self.target.parameters():
            param.stop_gradient = True  # the target network is never updated
            # rescale the random weights in place: w -> sign(w) * sqrt(|w|)
            param.set_value(param.sign() * param.abs().sqrt())
def forward(self, next_obs):
target_feature = self.target(next_obs)
predict_feature = self.predictor(next_obs)
return predict_feature, target_feature
# computing the intrinsic reward (used during evaluation)
def compute_intrinsic_reward(rnd, next_obs):
next_obs = paddle.to_tensor(next_obs, dtype="float32")
target_next_feature = rnd.target(next_obs)
predict_next_feature = rnd.predictor(next_obs)
intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum(1) / 2
return intrinsic_reward.numpy().item()
Overwriting rnd_model.py
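A minimal usage sketch (random data stands in for a real batch of observations):

import numpy as np
from rnd_model import RNDModel, compute_intrinsic_reward

rnd = RNDModel(input_size=4, output_size=7)
obs = np.random.rand(1, 4, 84, 84).astype("float32")
print(compute_intrinsic_reward(rnd, obs))  # larger for unfamiliar states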
The training file
The training procedure.
%%writefile MARIO/train.py
import os
# install dependencies
os.system("pip install gym-super-mario-bros")
os.system("pip install gym")
os.system("clear")
os.environ['OMP_NUM_THREADS'] = '1'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from game_env import MultipleEnvironments
from game_env import create_train_env
from model import MARIO
from rnd_model import RNDModel, compute_intrinsic_reward
import paddle
import paddle.nn as nn
from paddle.distribution import Categorical
import paddle.nn.functional as F
import multiprocessing as _mp
import numpy as np
import shutil
from visualdl import LogWriter
from collections import deque
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
import time
from tqdm import trange
def eval(local_model, rnd_model, log_writer, eval_epch):
    # choose the action set
if action_type == "right":
actions = RIGHT_ONLY
elif action_type == "simple":
actions = SIMPLE_MOVEMENT
else:
actions = COMPLEX_MOVEMENT
env = create_train_env(world, stage, actions)
state = paddle.to_tensor(env.reset(), dtype="float32")
curr_step = 0
max_step = int(1e9)
extrinsic_reward = 0
intrinsic_reward = 0
max_reward = 0
actions = deque(maxlen=max_actions)
while True:
curr_step += 1
logits, value = local_model(state)
policy = F.softmax(logits, axis=-1).numpy()
action = np.argmax(policy)
state, reward, done, info = env.step(int(action))
        # accumulate the extrinsic (game) reward
        extrinsic_reward += reward
        # accumulate the intrinsic (RND) reward
        intrinsic_reward += compute_intrinsic_reward(rnd_model, state)
        # save the model when the level is cleared
if info["flag_get"]:
print("Finished")
paddle.save(local_model.state_dict(),
"{}/mario_{}_{}.pdparams".format(saved_path, world, stage))
        # rendering is not available on AI Studio
# env.render()
actions.append(action)
if curr_step > num_global_steps or actions.count(actions[0]) == actions.maxlen:
done = True
if done:
curr_step = 0
eval_epch += 1
actions.clear()
log_writer.add_scalar("Eval Extrinsic reward", value=paddle.to_tensor(extrinsic_reward), step=eval_epch)
log_writer.add_scalar("Eval Intrinsic reward", value=paddle.to_tensor(intrinsic_reward), step=eval_epch)
break
state = paddle.to_tensor(state, dtype="float32")
return eval_epch
def train():
if os.path.isdir(log_path):
shutil.rmtree(log_path)
os.makedirs(log_path)
if not os.path.isdir(saved_path):
os.makedirs(saved_path)
    # create the parallel environments
    envs = MultipleEnvironments(world, stage, action_type, num_processes)
    # define the actor-critic model
    model = MARIO(envs.num_states, envs.num_actions)
    # define the RND module
    rnd = RNDModel(envs.num_states, envs.num_actions)
    clip_grad = paddle.nn.ClipGradByNorm(clip_norm=0.5)
    # include the RND predictor's parameters so the forward loss can actually update it
    optimizer = paddle.optimizer.Adam(learning_rate=lr, parameters=model.parameters() + rnd.predictor.parameters(), grad_clip=clip_grad)
forward_mse = nn.MSELoss(reduction='none')
log_writer = LogWriter(logdir = log_path, comment= "Super Mario Bros")
[agent_conn.send(("reset", None)) for agent_conn in envs.agent_conns] # 重置环境
curr_states = [agent_conn.recv() for agent_conn in envs.agent_conns] # 返回图像序列
curr_states = paddle.to_tensor(np.concatenate(curr_states, 0), dtype="float32")
curr_episode = 0
eval_epch = 0
while True:
        # save a checkpoint periodically
if curr_episode % save_interval == 0 and curr_episode > 0:
paddle.save(model.state_dict(),
"{}/mario_{}_{}_{}.pdparams".format(saved_path, world, stage, curr_episode))
curr_episode += 1
old_log_policies, actions, values, states, next_states, rewards, dones = [], [], [], [], [], [], []
        # rollout phase: collect num_local_steps transitions from each environment
train_reward = 0
for _ in range(num_local_steps):
states.append(curr_states)
logits, value = model(curr_states)
values.append(value.squeeze())
policy = F.softmax(logits, axis=-1)
old_m = Categorical(policy)
action = old_m.sample([1]).squeeze()
actions.append(action)
origin_old_log_policy = old_m.log_prob(action)
                # log_prob yields a matrix here; tril followed by triu zeroes
                # everything except the main diagonal (the per-sample log-probs)
                # eye = paddle.eye(policy.shape[0])
                # old_log_policy = paddle.sum(paddle.multiply(origin_old_log_policy, eye), axis=1).squeeze()
                old_log_policy = paddle.tensor.tril(origin_old_log_policy)
                old_log_policy = paddle.tensor.triu(old_log_policy)
                old_log_policy = paddle.sum(old_log_policy, axis=1)
old_log_policies.append(old_log_policy)
            # step the environments with the sampled actions
[agent_conn.send(("step", act)) for agent_conn, act in zip(envs.agent_conns, action.numpy().astype("int16").tolist())]
state, reward, done, info = zip(*[agent_conn.recv() for agent_conn in envs.agent_conns])
# for _ in range(len(info)):
# if info[_]["flag_get"]:
# print("Thread_{} Finished".format(_))
train_reward += np.mean(reward)
state = paddle.to_tensor(np.concatenate(state, 0), dtype="float32")
reward = paddle.to_tensor(reward, dtype="float32")
done = paddle.to_tensor(done, dtype="float32")
rewards.append(reward)
dones.append(done)
            # collect next states for training the RND predictor
next_states.append(state)
curr_states = state
log_writer.add_scalar("Training Reward", value=paddle.to_tensor(train_reward, dtype="float32"), step=curr_episode)
        _, next_value = model(curr_states)
next_value = next_value.squeeze()
old_log_policies = paddle.concat(old_log_policies, axis=-1).detach()
actions = paddle.concat(actions).squeeze()
values = paddle.concat(values).squeeze().detach()
states = paddle.concat(states).squeeze()
next_states = paddle.concat(next_states).squeeze()
gae = paddle.to_tensor([0.])
R = []
        # compute the policy-gradient advantages with GAE
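        # GAE recursion, iterated backwards over the rollout:
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        #   A_t     = delta_t + gamma * tau * A_{t+1}
        # R collects A_t + V(s_t), the return targets for the critic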
for value, reward, done in list(zip(values, rewards, dones))[::-1]:
gae = gae * gamma * tau
gae = gae + reward + gamma * next_value.detach() * (1.0 - done) - value.detach()
next_value = value
R.append(gae + value.detach())
        R = R[::-1]  # restore chronological order
R = paddle.concat(R).detach()
advantages = R - values
for i in trange(num_epochs):
model.train()
            indice = paddle.randperm(num_local_steps * num_processes)  # random permutation of all sample indices
for j in range(batch_size):
batch_indices = indice[
int(j * (num_local_steps * num_processes / batch_size)): int((j + 1) * (
num_local_steps * num_processes / batch_size))]
                # gather the minibatch
batch_advantages = paddle.gather(advantages, batch_indices, axis=0)
batch_R = paddle.gather(R, batch_indices, axis=0)
batch_old_log_policies = paddle.gather(old_log_policies, batch_indices, axis=0)
batch_states = paddle.gather(states, batch_indices, axis=0)
batch_next_states = paddle.gather(next_states, batch_indices, axis=0)
batch_actions = paddle.gather(actions, batch_indices, axis=0)
                # RND: forward (prediction) loss on the next states
                predict_next_state_feature, target_next_state_feature = rnd(batch_next_states)
                forward_loss = forward_mse(predict_next_state_feature, target_next_state_feature.detach()).mean(-1)
                # only a random proportion of samples updates the predictor,
                # which keeps it from catching up with the target too quickly
                mask = paddle.rand([len(forward_loss)])
                mask = (mask < update_proportion).cast("float32")
                forward_loss = (forward_loss * mask).sum() / paddle.max(paddle.concat([mask.sum(), paddle.to_tensor([1.0])]))
                # PPO update
logits, value = model(batch_states)
new_policy = F.softmax(logits, axis=-1)
new_m = Categorical(new_policy)
origin_new_log_policy = new_m.log_prob(batch_actions)
                # eye = paddle.eye(new_policy.shape[0])
                # new_log_policy = paddle.sum(paddle.multiply(origin_new_log_policy, eye), axis=1).squeeze()
                # same diagonal trick as in the rollout phase
                new_log_policy = paddle.tensor.tril(origin_new_log_policy)
                new_log_policy = paddle.tensor.triu(new_log_policy)
                new_log_policy = paddle.sum(new_log_policy, axis=1).squeeze()
ratio = paddle.exp(new_log_policy - batch_old_log_policies)
actor_loss = paddle.concat([paddle.unsqueeze(ratio * batch_advantages, axis=0), \
paddle.unsqueeze(paddle.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * batch_advantages, axis=0)])
actor_loss = -paddle.mean(paddle.min(actor_loss, axis=0))
# critic_loss = paddle.mean((batch_R - value.squeeze()).pow(2)) / 2
critic_loss = F.smooth_l1_loss(batch_R, value.squeeze())
entropy_loss = paddle.mean(new_m.entropy())
total_loss = actor_loss + critic_loss - beta * entropy_loss + forward_loss
                # skip this minibatch if the loss has diverged to NaN
                if np.isnan(total_loss.numpy().item()):
                    continue
optimizer.clear_grad()
total_loss.backward()
optimizer.step()
print("Episode: {}. Total loss: {}, Forward loss: {}".format(curr_episode, total_loss.numpy().item(), forward_loss.numpy().item()))
model.eval()
eval_epch = eval(model, rnd, log_writer, eval_epch)
        if not np.isnan(total_loss.numpy().item()):
            log_writer.add_scalar("Total loss", value=total_loss, step=curr_episode)
            log_writer.add_scalar("Forward loss", value=forward_loss, step=curr_episode)
# global settings that should not need tuning
gamma = 0.9  # reward discount factor
tau = 1.0  # GAE (Generalized Advantage Estimation) parameter
beta = 0.01  # entropy bonus coefficient
epsilon = 0.2  # clipping parameter of PPO's clipped surrogate objective
# RND-related parameters
update_proportion = 0.25
batch_size = 16
num_epochs = 10
num_local_steps = 512
num_global_steps = int(5e6)
save_interval = 50  # checkpoint interval (in episodes)
max_actions = 512
log_path = "./log" # 日志保存路径
saved_path = "./models"
# global settings you may want to tune
world = 4  # world number
stage = 4  # stage number
action_type = "simple"  # action set
num_processes = 6  # number of parallel worker processes
lr = float(4e-5)  # learning rate
if __name__ == "__main__":
paddle.seed(314)
print("Proximal Policy Optimization Algorithms (PPO) playing Super Mario Bros")
print("Training Processes:{}".format(num_processes))
train()
Overwriting train.py
Let's run it
%cd ~
%cd MARIO
%env PYTHONPATH=.:$PYTHONPATH
!python train.py
RND + PPO Plays Montezuma's Revenge (run in a Jupyter environment)
Link: https://pan.baidu.com/s/1aHVQSJEmCM98Hof9YfL7iw
Extraction code: 0uzl
Install the dependency, just in case.
!pip install atari_py
Unzip the Montezuma's Revenge program files (only needs to be done once)
# only run once
#!unzip -oq /home/aistudio/RND-Montezuma-Revange.zip
Now training can begin
Montezuma's Revenge is quite a hard game, so be patient!
%cd ~
%cd RND-Montezuma-Revange
%env PYTHONPATH=.:$PYTHONPATH
!python train.py
Appendix: How to Visualize the Training Data
Click Visualization and add the "log" folder under the directory containing the training script.
(Screenshot for reference only.)