PPO Code Implementation

The complete implementation flow of PPO2 (the clipped version of PPO) is as follows.

To be completed:
- [ ] Training (a minimal training-loop sketch is given after the code below)
- [ ] Game environment; later this will be run on Super Mario

The basic flow is as follows:

  • Build the actor-critic network classes:
    • an actor network whose input is the state and whose output is a probability distribution over actions
    • a critic network whose input is the state and whose output is an estimate of the state value (used as a baseline, not the raw reward)
  • A PPO class that ties the two networks together. The complete code is walked through below.
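For reference, the update step in the code below optimizes the standard PPO clipped surrogate objective, with the advantage estimated by GAE (generalized advantage estimation):

$$
L^{CLIP}(\theta) = \mathbb{E}_t\!\left[\min\!\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\right],
\qquad r_t(\theta) = \frac{\pi_\theta(a_t\mid s_t)}{\pi_{\theta_\text{old}}(a_t\mid s_t)}
$$

$$
\hat{A}_t = \sum_{l\ge 0}(\gamma\lambda)^l\,\delta_{t+l},
\qquad \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
$$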
```python
import gym
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt


class policy_net(torch.nn.Module):
    # Actor: maps a state to a probability distribution over the discrete actions
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(policy_net, self).__init__()
        self.f1 = torch.nn.Linear(state_dim, hidden_dim)
        self.f2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.f1(x))
        return F.softmax(self.f2(x), dim=1)

class value_net(torch.nn.Module):
    # Critic: maps a state to a scalar estimate of the state value V(s)
    def __init__(self, state_dim, hidden_dim):
        super(value_net, self).__init__()
        self.f1 = torch.nn.Linear(state_dim, hidden_dim)
        self.f2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.f1(x))
        return self.f2(x)

class PPO:
    def __init__(self, state_dim, action_dim, hidden_dim, lr_p, lr_v, lmbda, epochs, eps, gamma, device):
        self.action_net = policy_net(state_dim, action_dim, hidden_dim).to(device)
        self.critic_net = value_net(state_dim, hidden_dim).to(device)
        self.actor_opt = torch.optim.Adam(self.action_net.parameters(), lr=lr_p)
        self.cri_opt = torch.optim.Adam(self.critic_net.parameters(), lr=lr_v)
        self.lr_a = lr_p
        self.lr_c = lr_v

        self.device = device
        self.gamma = gamma
        self.lmbda = lmbda
        self.epochs = epochs
        self.eps = eps  # clipping range of the PPO-clip objective
        
    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        prob = self.action_net(state)
        action_dist = torch.distributions.Categorical(prob)
        action = action_dist.sample()
        return action.item()

    def update(self, tmp):
        states = torch.tensor(tmp['states'], dtype=torch.float).to(self.device)
        rewards = torch.tensor(tmp['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        actions = torch.tensor(tmp['actions'], dtype=torch.int64).view(-1, 1).to(self.device)
        dones = torch.tensor(tmp['dones'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(tmp['next_states'], dtype=torch.float).to(self.device)

        # TD target and TD error used by the critic and by GAE
        td_target = rewards + self.gamma * self.critic_net(next_states) * (1 - dones)
        td_delta = td_target - self.critic_net(states)
        adv = self.compute_advantage(self.gamma, self.lmbda, td_delta.cpu()).to(self.device)

        # log-probabilities of the taken actions under the old (pre-update) policy
        old_log_probs = torch.log(self.action_net(states).gather(1, actions)).detach()

        for _ in range(self.epochs):
            log_probs = torch.log(self.action_net(states).gather(1, actions))
            ratio = torch.exp(log_probs - old_log_probs)
            surr1 = ratio * adv
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * adv  # clipped surrogate

            actor_loss = torch.mean(-torch.min(surr1, surr2))
            cri_loss = torch.mean(F.mse_loss(self.critic_net(states), td_target.detach()))

            self.actor_opt.zero_grad()
            self.cri_opt.zero_grad()
            actor_loss.backward()
            cri_loss.backward()
            self.actor_opt.step()
            self.cri_opt.step()

    def compute_advantage(self, gamma, lmbda, td_delta):
        # GAE: accumulate discounted TD errors backwards through the trajectory
        td_delta = td_delta.detach().numpy()
        advantage_list = []
        advantage = 0.0
        for delta in td_delta[::-1]:
            advantage = gamma * lmbda * advantage + delta
            advantage_list.append(advantage)
        advantage_list.reverse()  # back to chronological order
        return torch.tensor(advantage_list, dtype=torch.float)
```
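To start on the "training" item from the to-do list above, here is a minimal training-loop sketch. It is not part of the original code: the environment name and every hyperparameter value are placeholder assumptions, and it assumes the classic gym API in which `env.reset()` returns an observation and `env.step()` returns a 4-tuple (newer gym/gymnasium versions return extra values).

```python
# Minimal training-loop sketch for the PPO class above (hyperparameters are placeholders).
import gym
import torch

env = gym.make('CartPole-v0')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = PPO(state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.n,
            hidden_dim=128,
            lr_p=3e-4, lr_v=1e-3,
            lmbda=0.95, epochs=10, eps=0.2, gamma=0.98,
            device=device)

for episode in range(500):
    # collect one on-policy episode
    tmp = {'states': [], 'actions': [], 'rewards': [], 'next_states': [], 'dones': []}
    state = env.reset()          # classic gym API: reset() returns the observation
    done = False
    episode_return = 0.0
    while not done:
        action = agent.take_action(state)
        next_state, reward, done, _ = env.step(action)   # classic gym API: 4-tuple
        tmp['states'].append(state)
        tmp['actions'].append(action)
        tmp['rewards'].append(reward)
        tmp['next_states'].append(next_state)
        tmp['dones'].append(done)
        state = next_state
        episode_return += reward
    agent.update(tmp)            # one PPO update per collected episode
    if (episode + 1) % 50 == 0:
        print(f'episode {episode + 1}, return {episode_return:.1f}')
```

Because PPO is on-policy, each collected episode is used for `epochs` gradient passes inside `update` and then discarded.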
Below is a simple PyTorch implementation of PPO, largely following the implementation in OpenAI baselines, using CartPole-v1 as the example:

```python
import gym
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler


class ActorCritic(nn.Module):
    def __init__(self, obs_shape, action_space):
        super(ActorCritic, self).__init__()
        self.observation_space = obs_shape
        self.action_space = action_space
        self.actor_fc1 = nn.Linear(obs_shape[0], 64)
        self.actor_fc2 = nn.Linear(64, action_space.n)
        self.critic_fc1 = nn.Linear(obs_shape[0], 64)
        self.critic_fc2 = nn.Linear(64, 1)
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.masks = []

    def act(self, obs):
        actor_x = torch.tanh(self.actor_fc1(obs))
        action_scores = self.actor_fc2(actor_x)
        dist = Categorical(logits=action_scores)
        action = dist.sample()
        self.log_probs.append(dist.log_prob(action))
        return action.item()

    def evaluate(self, obs):
        actor_x = torch.tanh(self.actor_fc1(obs))
        action_scores = self.actor_fc2(actor_x)
        dist = Categorical(logits=action_scores)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        critic_x = torch.tanh(self.critic_fc1(obs))
        value = self.critic_fc2(critic_x)
        self.log_probs.append(log_prob)
        self.values.append(value)
        return action.item(), value.item()

    def clear_memory(self):
        del self.log_probs[:]
        del self.values[:]
        del self.rewards[:]
        del self.masks[:]


class PPO:
    def __init__(self, env_name, batch_size=64, gamma=0.99, clip_param=0.2,
                 ppo_epoch=10, lr=3e-4, eps=1e-5):
        self.env = gym.make(env_name)
        self.obs_space = self.env.observation_space
        self.act_space = self.env.action_space
        self.clip_param = clip_param
        self.ppo_epoch = ppo_epoch
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps = eps
        self.lr = lr
        self.net = ActorCritic(self.obs_space.shape, self.act_space)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr, eps=self.eps)
        self.net.train()

    def get_batch(self):
        obs = self.obs_buf[np.asarray(self.batch_ids)]
        actions = self.act_buf[np.asarray(self.batch_ids)]
        rewards = self.rew_buf[np.asarray(self.batch_ids)]
        dones = self.done_buf[np.asarray(self.batch_ids)]
        next_obs = self.obs_buf[np.asarray(self.batch_ids) + 1]
        masks = 1 - dones.astype(np.float32)
        return obs, actions, rewards, next_obs, masks

    def learn(self, obs, actions, rewards, next_obs, masks):
        obs = torch.tensor(obs, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.float32)
        rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1)
        masks = torch.tensor(masks, dtype=torch.float32).unsqueeze(1)
        next_obs = torch.tensor(next_obs, dtype=torch.float32)

        with torch.no_grad():
            _, next_value = self.net.evaluate(next_obs)

        advantage = rewards + self.gamma * masks * next_value - self.net.values[-1]

        # GAE-style returns
        returns = []
        gae = 0
        lambda_ = 0.95
        for i in reversed(range(len(rewards))):
            delta = rewards[i] + self.gamma * masks[i] * self.net.values[i + 1] - self.net.values[i]
            gae = delta + self.gamma * masks[i] * lambda_ * gae
            returns.insert(0, gae + self.net.values[i])
        returns = torch.tensor(returns, dtype=torch.float32)

        for _ in range(self.ppo_epoch):
            for ind in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.batch_size, False):
                log_prob, value = self.net.evaluate(obs[ind])
                ratio = torch.exp(log_prob - self.net.log_probs[ind])
                adv = advantage[ind]
                surr1 = ratio * adv
                surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * adv
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (returns[ind] - value).pow(2).mean()
                loss = actor_loss + 0.5 * critic_loss

                # optimize
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        self.net.clear_memory()

    def run(self, max_iter=10000):
        obs = self.env.reset()
        episode_reward = 0
        for i in range(max_iter):
            action = self.net.act(torch.tensor(obs, dtype=torch.float32))
            next_obs, reward, done, _ = self.env.step(action)
            episode_reward += reward
            self.net.rewards.append(reward)
            self.net.masks.append(1 - done)
            obs = next_obs
            if done:
                obs = self.env.reset()
                self.net.clear_memory()
            if i % self.batch_size == 0 and i != 0:
                self.learn(*self.get_batch())
            if i % 100 == 0 and i != 0:
                print('Episode {}, Reward: {:.2f}'.format(i, episode_reward / 100))
                episode_reward = 0
```

In the code above we first define an ActorCritic class containing an Actor and a Critic. The Actor outputs a probability distribution over actions for the current state and samples from it; the Critic outputs the value of the current state. In PPO both the Actor and the Critic are updated. The core of PPO is computing the Advantage (see the explanation in the first answer). In addition, a BatchSampler is used to sample minibatches of data. In the run function we first choose an action for the current state, execute it, and store the related information in the ActorCritic class. When the current episode ends, the stored information is cleared and the environment is reset. Whenever the step count is divisible by batch_size, a PPO update is performed.
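A hypothetical way to drive these classes is sketched below. It is illustrative only: as written, get_batch expects rollout buffers (obs_buf, act_buf, rew_buf, done_buf, batch_ids) that are never allocated above, so those would have to be implemented before learn can actually run.

```python
# Illustrative usage sketch; assumes the rollout buffers used by get_batch() have been implemented.
if __name__ == '__main__':
    agent = PPO('CartPole-v1', batch_size=64, gamma=0.99, clip_param=0.2)
    agent.run(max_iter=10000)   # collects experience and triggers a PPO update every batch_size steps
```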