Reference: https://spinningup.qiwihui.com/zh_CN/latest/spinningup/rl_intro3.html
Best read alongside the policy-gradient derivations on that Spinning Up page.
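For reference, the gradient estimator derived on that page, which compute_loss in the code below turns into a PyTorch loss, is:

\nabla_\theta J(\pi_\theta) \approx \hat{g} = \frac{1}{|\mathcal{D}|} \sum_{\tau \in \mathcal{D}} \sum_{t=0}^{T} \nabla_\theta \log \pi_\theta(a_t \mid s_t) \, R(\tau)

where \mathcal{D} is the set of sampled trajectories and R(\tau) is the undiscounted return of trajectory \tau.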
Full code
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import numpy as np
import gym
from gym.spaces import Discrete, Box
gym.logger.set_level(40)
# Build a simple multi-layer perceptron (MLP)
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
layers = []
    # Every hidden layer is followed by a tanh activation; the last layer has no activation
    # nn.Identity is a placeholder: it returns its input unchanged
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
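# Added illustration (not part of the original Spinning Up script): for sizes=[4, 32, 2],
# mlp returns
#   nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2), nn.Identity())
# i.e. one tanh hidden layer of width 32 and a linear output layer producing two logits.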
def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2,
epochs=50, batch_size=5000, render=False):
    # Create the CartPole-v0 environment
env = gym.make(env_name)
    # Check that the observation space is a Box and that the action space is Discrete
assert isinstance(env.observation_space, Box), \
"This example only works for envs with continuous state spaces."
assert isinstance(env.action_space, Discrete), \
"This example only works for envs with discrete action spaces."
    # Dimension of the observation space: (4,) for CartPole
obs_dim = env.observation_space.shape[0]
    # Number of discrete actions: 2 for CartPole
n_acts = env.action_space.n
    # sizes = [4, 32, 2]
    # Resulting network: Linear(4, 32) + Tanh, then Linear(32, 2) followed by nn.Identity (no activation)
logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts])
# make function to compute action distribution
    # Build the action distribution for a given observation
def get_policy(obs):
        logits = logits_net(obs)  # raw network outputs; can be any real numbers
        # Usually a softmax layer would be appended to turn these into per-action probabilities.
        # PyTorch's Categorical can be constructed in two ways:
        # either pass probs (every entry must be non-negative and the entries must sum to 1),
        # or pass the raw network outputs as logits and let Categorical normalize them internally.
# The `logits` argument will be interpreted as unnormalized log probabilities
# and can therefore be any real number.
return Categorical(logits=logits)
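    # Added illustration of the two Categorical constructors (not from the original script):
    #   Categorical(probs=torch.tensor([0.7, 0.3]))    # entries must be non-negative and sum to 1
    #   Categorical(logits=torch.tensor([0.85, 0.0]))  # any real numbers; softmax is applied internally
    # For these particular numbers the two distributions are approximately the same.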
# make action selection function (outputs int actions, sampled from policy)
def get_action(obs):
        # Given an observation, build the action distribution, sample from it (a tensor),
        # then use .item() to get the sampled value as a plain Python int.
        # The sample is an action index: with a 2-action distribution it can only be 0 or 1.
return get_policy(obs).sample().item()
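    # Added note: e.g. if the policy assigns probabilities [0.7, 0.3] to the two actions,
    # sample() returns index 0 about 70% of the time and index 1 about 30% of the time.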
# make loss function whose gradient, for the right data, is policy gradient
    # This is a pseudo-loss: for data sampled from the current policy, its gradient is the policy gradient
def compute_loss(obs, act, weights):
        # Note: obs/act/weights hold a whole batch of data (several trajectories), so we take the mean over the batch
        logp = get_policy(obs).log_prob(act)  # log-probability of each taken action (see the PG derivation)
        return -(logp * weights).mean()  # negated so that minimizing this loss is gradient ascent on the objective
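    # Added shape note: for a batch of N state-action pairs, logp and weights are both
    # length-N tensors, and the negated mean of their elementwise product is the scalar
    # whose gradient points along (minus) the estimator g_hat shown at the top of this post
    # (up to a constant factor, since .mean() divides by the number of timesteps rather than
    # the number of trajectories).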
# make optimizer
optimizer = Adam(logits_net.parameters(), lr=lr)
# for training policy
def train_one_epoch():
# make some empty lists for logging.
batch_obs = [] # for observations
batch_acts = [] # for actions
batch_weights = [] # for R(tau) weighting in policy gradient
batch_rets = [] # for measuring episode returns
batch_lens = [] # for measuring episode lengths
# reset episode-specific variables
obs = env.reset() # first obs comes from starting distribution
done = False # signal from environment that episode is over
        ep_rews = []            # per-step rewards collected over the current episode
# render first episode of each epoch
finished_rendering_this_epoch = False
# collect experience by acting in the environment with current policy
        # This is the sampling phase: several trajectories are collected per epoch
while True:
# rendering
if (not finished_rendering_this_epoch) and render:
env.render()
# save obs
batch_obs.append(obs.copy())
# act in the environment
act = get_action(torch.as_tensor(obs, dtype=torch.float32))
            # interact with the environment
            obs, rew, done, _ = env.step(act)  # CartPole gives a reward of +1 for every step
# save action, reward
batch_acts.append(act)
ep_rews.append(rew) # [1,1,1,1,1,....]
if done:
                # One sampled trajectory has ended (the game is over).
                # The trajectory return is the plain sum of rewards, not a discounted sum,
                # so there is no discount factor here.
                # Recall the rule: CartPole-v0 gives a reward of 1 for every step survived.
ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                # Record this trajectory's total return and length in the batch statistics
batch_rets.append(ep_ret)
batch_lens.append(ep_len)
# the weight for each logprob(a|s) is R(tau)
                # Here * is Python list replication, not multiplication: the scalar ep_ret is
                # repeated ep_len times, so every log-prob in this trajectory is weighted by the
                # same total return R(tau) (see the Spinning Up derivation and the short example
                # after the sampling loop below).
batch_weights += [ep_ret] * ep_len
                # The episode is over, so reset the environment and the per-episode buffers
obs, done, ep_rews = env.reset(), False, []
# won't render again this epoch
finished_rendering_this_epoch = True
                # Only leave the sampling loop once enough steps have been collected
                # (batch_size = 5000 by default)
if len(batch_obs) > batch_size:
break
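        # Added example of the weighting above: a CartPole-v0 episode that lasted 20 steps
        # has R(tau) = 20, so [20.0] * 20 is appended to batch_weights, i.e. every log-prob
        # from that trajectory is weighted by the same total return.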
        # Zero the gradients before every update, since PyTorch accumulates gradients by default
optimizer.zero_grad()
        # Compute the scalar pseudo-loss for the whole batch (its gradient is the policy gradient estimate)
batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),
act=torch.as_tensor(batch_acts, dtype=torch.int32),
weights=torch.as_tensor(batch_weights, dtype=torch.float32)
)
        # Backpropagate to compute the gradients
batch_loss.backward()
        # Take one optimizer step. Because the loss was negated, descending on this loss is
        # gradient ascent on the expected return; the episode returns should grow over training
optimizer.step()
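        # Added note: up to Adam's adaptive scaling, this step is
        #   theta <- theta - lr * grad(loss) = theta + lr * g_hat,
        # i.e. gradient ascent on the expected return.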
return batch_loss, batch_rets, batch_lens
# training loop
for i in range(epochs):
batch_loss, batch_rets, batch_lens = train_one_epoch()
print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
(i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0')
    # store_true: the flag is False by default and becomes True whenever it is passed,
    # so rendering only requires adding --render on the command line
parser.add_argument('--render', action='store_true')
parser.add_argument('--lr', type=float, default=1e-2)
args = parser.parse_args()
print('\nUsing simplest formulation of policy gradient.\n')
train(env_name=args.env_name, render=args.render, lr=args.lr)
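To run the script (the filename simple_pg.py below is just an example; use whatever name you saved it under):

python simple_pg.py --env CartPole-v0 --lr 1e-2
python simple_pg.py --render

The second command additionally renders the first episode of each epoch.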