我们先回忆一下TRPO的优化目标:
对于这个优化目标,TRPO算法采用泰勒展开近似之后直接求解,这个过程非常复杂,计算量也很大。在微积分中,我们知道对于这类带约束条件的最大化问题,可以利用拉格朗日乘数法将其转化为无约束条件下的最大化问题,这就是PPO-惩罚算法的思想。
PPO-惩罚算法用拉格朗日乘数法直接将KL散度的限制放进目标函数中,这就变成了一个无约束的优化问题,在迭代的过程中不断更新KL散度前的系数,即:
其中,δ是事先设定的超参数,用于限制学习策略和之前一轮策略的差距。
PPO的另一种形式PPO-截断更加直接,它在目标函数中进行限制,以保证新旧参数的差距不会太大,即:
其中,$\operatorname{clip}(x,l,r):=\max(\min(x,r),l)$,即把 $x$ 限制在 $[l,r]$ 内;$\epsilon$ 是一个超参数,表示进行截断的范围。如果 $A(s,a)>0$,说明这个动作的价值高于平均值,最大化这个式子会增大概率比率 $\frac{\pi_\theta(a|s)}{\pi_{\theta_k}(a|s)}$,但不会让其超过 $1+\epsilon$。反之,如果 $A(s,a)<0$,最大化这个式子会减小概率比率,但不会让其小于 $1-\epsilon$,如图:
这里可能有人会问,clip已经将概率比率控制在KL散度(约束条件)之内,为什么还要使用一个min函数呢?事实上,对未截断的目标 $\frac{\pi_\theta(a|s)}{\pi_{\theta_k}(a|s)}A(s,a)$ 与截断后的目标取小,是为了防止目标的变化过于剧烈:比如原本没有进行截断的目标就已经满足了约束条件,如果此时再取到clip的边界值,那就会与上一个目标值的差距较大,目标变化过于明显,而两者取小就可以避免这种情况,从而保证了训练的稳定性。
大量实验表明,PPO-截断总是比PPO-惩罚的表现更好,因此下面我们专注于PPO-截断的代码实现。
我们仍然采用车杆环境与倒立摆环境。
导入库
import gym
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import rl_utils
定义策略网络和价值网络(与Actor-Critic算法相同)
class PolicyNet(torch.nn.Module):
    """Two-layer MLP mapping a state batch to a softmax distribution
    over discrete actions."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        # Hidden ReLU layer, then per-row softmax over action logits.
        hidden = F.relu(self.fc1(x))
        return F.softmax(self.fc2(hidden), dim=1)
class ValueNet(torch.nn.Module):
    """Two-layer MLP approximating the state-value function V(s);
    outputs one scalar per state."""

    def __init__(self, state_dim, hidden_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)
定义PPO-截断算法
class PPO:
    """PPO algorithm with the clipped surrogate objective (PPO-Clip),
    for discrete action spaces."""

    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device):
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        # TRPO's constrained objective cannot be updated by plain gradient
        # methods and must be solved (approximately) by hand.  PPO builds the
        # constraint into the objective itself via clipping, so it already
        # respects the constraint and ordinary first-order optimizers (Adam)
        # suffice, which greatly simplifies the code.
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.gamma = gamma    # discount factor
        self.lmbda = lmbda    # GAE lambda
        self.epochs = epochs  # number of training epochs per batch of rollout data
        self.eps = eps        # PPO clipping-range hyperparameter
        self.device = device

    def take_action(self, state):
        # Sample one action from the current categorical policy.
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def update(self, transition_dict):
        """Run self.epochs epochs of PPO-Clip gradient updates on one batch
        of transitions (dict of lists keyed by 'states', 'actions', ...)."""
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # One-step TD target and TD error for the critic.
        td_target = rewards + self.gamma * self.critic(next_states) * (1 - dones)
        td_delta = td_target - self.critic(states)
        # Generalized advantage estimation (helper runs on CPU tensors).
        advantage = rl_utils.compute_advantage(self.gamma, self.lmbda,
                                               td_delta.cpu()).to(self.device)
        # Log-probs under the behavior (old) policy; detached so they stay
        # fixed while the actor changes over the epochs below.
        old_log_probs = torch.log(self.actor(states).gather(1, actions)).detach()
        for _ in range(self.epochs):
            log_probs = torch.log(self.actor(states).gather(1, actions))
            ratio = torch.exp(log_probs - old_log_probs)
            surr1 = ratio * advantage
            # Clipped surrogate: keeps the probability ratio within
            # [1 - eps, 1 + eps], an implicit trust region replacing TRPO's
            # explicit KL constraint.
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            # Taking the elementwise min of the unclipped and clipped
            # surrogates keeps the update conservative and stabilizes
            # training; negated because optimizers minimize while we want to
            # maximize the objective.
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            critic_loss = torch.mean(
                F.mse_loss(self.critic(states), td_target.detach()))
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()
设置超参数,进行车杆环境试验
# Hyperparameters for the CartPole (discrete-action) experiment.
num_episodes = 500
hidden_dim = 128
actor_lr = 1e-3
critic_lr = 1e-2
gamma = 0.98   # discount factor
lmbda = 0.95   # GAE lambda
epochs = 10    # PPO epochs per batch
eps = 0.2      # clipping range
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the environment and fix seeds for reproducibility.
env_name = 'CartPole-v0'
env = gym.make(env_name)
torch.manual_seed(0)
env.reset(seed=0)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, lmbda,
            epochs, eps, gamma, device)
return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes)
绘图
# Plot the raw per-episode returns for the CartPole run.
episode_axis = list(range(len(return_list)))
plt.plot(episode_axis, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('PPO on {}'.format(env_name))
plt.show()

# Plot a 9-step moving average to smooth the curve.
smoothed_returns = rl_utils.moving_average(return_list, 9)
plt.plot(episode_axis, smoothed_returns)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('PPO on {}'.format(env_name))
plt.show()
倒立摆是与连续动作交互的环境,与TRPO算法一样,我们让策略网络输出连续动作高斯分布的均值和标准差。后续连续动作在高斯分布中采样得到。
class PolicyNetContinuous(torch.nn.Module):
    """MLP producing the mean and standard deviation of a Gaussian
    action distribution for continuous control."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc_mu = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_std = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        # tanh bounds the mean to (-2, 2); softplus keeps the std positive.
        mean = 2.0 * torch.tanh(self.fc_mu(hidden))
        std = F.softplus(self.fc_std(hidden))
        return mean, std
定义连续动作的PPO-截断
class PPOContinuous:
    """PPO-Clip for continuous action spaces with a Gaussian policy."""

    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device):
        self.actor = PolicyNetContinuous(state_dim, hidden_dim,
                                         action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.gamma = gamma    # discount factor
        self.lmbda = lmbda    # GAE lambda
        self.epochs = epochs  # PPO epochs per batch of rollout data
        self.eps = eps        # PPO clipping range
        self.device = device

    def take_action(self, state):
        # Sample one continuous action from the Gaussian policy;
        # returned as a 1-element list to match the env's action shape.
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        mu, sigma = self.actor(state)
        action_dist = torch.distributions.Normal(mu, sigma)
        action = action_dist.sample()
        return [action.item()]

    def update(self, transition_dict):
        """Run self.epochs epochs of PPO-Clip updates on one batch of
        transitions collected with the current policy."""
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # As in the TRPO chapter, rescale rewards to ease training on
        # Pendulum (raw rewards are non-positive).
        rewards = (rewards + 8.0) / 8.0
        td_target = rewards + self.gamma * self.critic(next_states) * (1 - dones)
        td_delta = td_target - self.critic(states)
        # Generalized advantage estimation (helper runs on CPU tensors).
        advantage = rl_utils.compute_advantage(self.gamma, self.lmbda,
                                               td_delta.cpu()).to(self.device)
        mu, std = self.actor(states)
        # Old-policy Gaussian; built from detached parameters so the old
        # log-probs stay fixed while the actor is updated below.
        action_dists = torch.distributions.Normal(mu.detach(), std.detach())
        old_log_probs = action_dists.log_prob(actions)
        for _ in range(self.epochs):
            mu, std = self.actor(states)
            action_dists = torch.distributions.Normal(mu, std)
            log_probs = action_dists.log_prob(actions)
            ratio = torch.exp(log_probs - old_log_probs)
            surr1 = ratio * advantage
            # Clipped surrogate: the ratio is held in [1 - eps, 1 + eps],
            # an implicit trust region replacing TRPO's KL constraint.
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            # Elementwise min of the two surrogates keeps the update
            # conservative; negated because optimizers minimize while we
            # want to maximize the objective.
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            critic_loss = torch.mean(
                F.mse_loss(self.critic(states), td_target.detach()))
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()
设置超参数,进行倒立摆环境试验
# Hyperparameters for the Pendulum (continuous-action) experiment.
num_episodes = 2000
hidden_dim = 128
actor_lr = 1e-4
critic_lr = 5e-3
gamma = 0.9   # discount factor
lmbda = 0.9   # GAE lambda
epochs = 10   # PPO epochs per batch
eps = 0.2     # clipping range
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the environment and fix seeds for reproducibility.
env_name = 'Pendulum-v1'
env = gym.make(env_name)
torch.manual_seed(0)
env.reset(seed=0)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]  # continuous action space
agent = PPOContinuous(state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                      lmbda, epochs, eps, gamma, device)
return_list = rl_utils.train_on_policy_agent(env, agent, num_episodes)
绘图
# Plot the raw per-episode returns for the Pendulum run.
episode_axis = list(range(len(return_list)))
plt.plot(episode_axis, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('PPO on {}'.format(env_name))
plt.show()

# Plot a 21-step moving average to smooth the curve.
smoothed_returns = rl_utils.moving_average(return_list, 21)
plt.plot(episode_axis, smoothed_returns)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('PPO on {}'.format(env_name))
plt.show()
可见,作为TRPO的改进算法,PPO简化TRPO中的复杂计算,并且它在实验中的性能绝大多数情况下比TRPO更好。