Deep Reinforcement Learning: Frontier Research and Practical Applications - PPO

import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import random
import gym
import time

# PPO actor-critic model
class Model(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Model, self).__init__()
        h_size_1 = 100
        h_size_2 = 100

        self.v_fc1 = nn.Linear(num_inputs, h_size_1*5)
        self.v_fc2 = nn.Linear(h_size_1*5, h_size_2)
        self.v = nn.Linear(h_size_2, 1)

        self.p_fc1 = nn.Linear(num_inputs, h_size_1)
        self.p_fc2 = nn.Linear(h_size_1, h_size_2)
        self.mu = nn.Linear(h_size_2, num_outputs)
        self.log_std = nn.Parameter(torch.zeros(1, num_outputs))

        for name, p in self.named_parameters():
            # zero-initialize all bias terms
            if 'bias' in name:
                p.data.fill_(0)
        self.train()

    def forward(self, inputs):
        # actor head
        x = torch.tanh(self.p_fc1(inputs))
        x = torch.tanh(self.p_fc2(x))
        mu = self.mu(x)
        # exp(log_std) is treated as the variance of the Gaussian policy
        sigma_sq = torch.exp(self.log_std)
        # critic head
        x = torch.tanh(self.v_fc1(inputs))
        x = torch.tanh(self.v_fc2(x))
        v = self.v(x)
        return mu, sigma_sq, v
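
For reference, here is a minimal usage sketch of this actor-critic network on a continuous-control task. The environment name Pendulum-v0 is only an illustrative assumption; any gym environment with a Box action space works the same way.

# Usage sketch (assumes a classic-control gym env such as Pendulum-v0)
env = gym.make('Pendulum-v0')
net = Model(env.observation_space.shape[0], env.action_space.shape[0])
obs = Variable(torch.Tensor(env.reset()).unsqueeze(0))   # shape (1, num_inputs)
mu, sigma_sq, v = net(obs)
# sample an action from the diagonal Gaussian policy, exactly as the worker below does
action = mu + sigma_sq.sqrt() * Variable(torch.randn(mu.size()))
print(action.data.squeeze().numpy(), v.data[0, 0])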

# Shared gradient buffers: workers accumulate gradients here for the chief to apply
class Shared_grad_buffers():
    def __init__(self, model):
        self.grads = {}
        for name, p in model.named_parameters():
            self.grads[name+'_grad'] = torch.zeros(p.size()).share_memory_()

    def add_gradient(self, model):
        for name, p in model.named_parameters():
            self.grads[name+'_grad'] += p.grad.data

    def reset(self):
        for name, grad in self.grads.items():
            self.grads[name].fill_(0)

# Running mean/variance statistics for observation normalization (shared across processes)
class Shared_obs_stats():
    def __init__(self, num_inputs):
        self.n = torch.zeros(num_inputs).share_memory_()
        self.mean = torch.zeros(num_inputs).share_memory_()
        self.mean_diff = torch.zeros(num_inputs).share_memory_()
        self.var = torch.zeros(num_inputs).share_memory_()


    def observes(self, obs):
        # observation mean var updates
        x = obs.data.squeeze()
        self.n += 1
        last_mean = self.mean.clone()
        self.mean += (x-self.mean)/self.n
        self.mean_diff += (x-last_mean)*(x-self.mean)
        self.var = torch.clamp(self.mean_diff/self.n, min=1e-2)

    def normalize(self, inputs):
        obs_mean = Variable(self.mean.unsqueeze(0).expand_as(inputs))
        obs_std = Variable(torch.sqrt(self.var).unsqueeze(0).expand_as(inputs))
        return torch.clamp((inputs-obs_mean)/obs_std, -5., 5.)

# Experience replay buffer for the collected rollouts
class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, events):
        for event in zip(*events):
            self.memory.append(event)
            if len(self.memory) > self.capacity:
                del self.memory[0]

    def clear(self):
        self.memory = []

    def sample(self, batch_size):
        samples = zip(*random.sample(self.memory, batch_size))
        return map(lambda x:torch.cat(x, 0), samples)
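
The PPO update in the worker below relies on a normal() helper that is not defined in this excerpt. The following is a minimal sketch that matches how it is called, treating sigma_sq as the variance of a diagonal Gaussian; the exact form in the original source may differ.

import math

# Sketch of the Gaussian density used for probs_old / probs in the update below.
# Assumption: sigma_sq is the variance, matching the .sqrt() sampling in the worker.
def normal(x, mu, sigma_sq):
    a = (-(x.detach() - mu).pow(2) / (2 * sigma_sq)).exp()
    b = 1. / (2 * sigma_sq * math.pi).sqrt()
    return a * b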

# Training worker (one per process): collects rollouts and computes PPO gradients
def train(params, traffic_light, counter, shared_model, shared_grad_buffers, shared_obs_stats):
    torch.manual_seed(params.seed)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]

    model = Model(num_inputs, num_outputs)
    memory = ReplayMemory(params.exploration_size)
    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        w = -1
        av_reward = 0
        cum_reward = 0
        cum_done = 0
        t = -1

        # rollout storage for this exploration phase
        states, actions, rewards, values = [], [], [], []
        returns, advantages = [], []

        # Perform K steps
        for step in range(params.num_steps):
            w += 1
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            states.append(state)
            mu, sigma_sq, v = model(state)
            eps = torch.randn(mu.size())
            # sample an action from the Gaussian policy
            action = (mu + sigma_sq.sqrt()*Variable(eps))
            actions.append(action)
            values.append(v)
            env_action = action.data.squeeze().numpy()
            state, reward, done, _ = env.step(env_action)
            done = (done or episode_length >= params.max_episode_length)
            cum_reward += reward
            reward = max(min(reward, 1), -1)
            rewards.append(reward)

            if done:
                cum_done += 1
                av_reward += cum_reward
                cum_reward = 0
                episode_length = 0
                state = env.reset()
            state = Variable(torch.Tensor(state).unsqueeze(0))
            if done:
                break
        # bootstrap the last value, then compute returns and GAE(lambda) advantages
        R = torch.zeros(1, 1)
        if not done:
            _, _, v = model(state)
            R = v.data
        values.append(Variable(R))
        R = Variable(R)
        A = Variable(torch.zeros(1, 1))

        for i in reversed(range(len(rewards))):
            td = rewards[i] + params.gamma*values[i+1].data[0, 0] - values[i].data[0, 0]
            A = float(td) + params.gamma*params.gae_param*A
            advantages.insert(0, A)
            R = A + values[i]
            returns.insert(0, R)
        # store useful info:
        memory.push([states, actions, returns, advantages])
        # PPO updates on the collected batch
        av_reward /= float(cum_done + 1)
        model_old = Model(num_inputs, num_outputs)
        model_old.load_state_dict(model.state_dict())

        for k in range(params.num_epoch):
            # load the latest shared model
            model.load_state_dict(shared_model.state_dict())
            model.zero_grad()
            # get initial signal
            signal_init = traffic_light.get()

            # new mini_batch
            batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(params.batch_size)
            # old action probabilities
            mu_old, sigma_sq_old, v_pred_old = model_old(batch_states.detach())
            probs_old = normal(batch_actions, mu_old, sigma_sq_old)
            # new action probabilities
            mu, sigma_sq, v_pred = model(batch_states)
            probs = normal(batch_actions, mu, sigma_sq)

            # probability ratio and clipped surrogate loss
            ratio = probs/(1e-10 + probs_old)
            surr1 = ratio * torch.cat([batch_advantages]*num_outputs, 1)
            surr2 = ratio.clamp(1-params.clip, 1+params.clip)*torch.cat([batch_advantages]*num_outputs, 1)
            loss_clip = -torch.mean(torch.min(surr1, surr2))

            # clipped value loss
            vfloss1 = (v_pred - batch_returns)**2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(-params.clip, params.clip)
            vfloss2 = (v_pred_clipped - batch_returns)**2
            loss_value = 0.5*torch.mean(torch.max(vfloss1, vfloss2))

            # entropy bonus
            loss_ent = -params.ent_coeff*torch.mean(probs*torch.log(probs+1e-5))

            total_loss = (loss_clip + loss_value + loss_ent)
            total_loss.backward(retain_graph=True)
            shared_grad_buffers.add_gradient(model)
            counter.increment()
            # wait for the chief to apply the update before the next epoch
            while traffic_light.get() == signal_init:
                pass

# Chief process: applies the accumulated worker gradients to the shared model
def chief(params, traffic_light, counter, shared_model, shared_grad_buffers, optimizer):
    while True:
        time.sleep(1)
        # workers wait after their last loss computation until the update is applied
        if counter.get() > params.update_treshold:
            for n, p in shared_model.named_parameters():
                p._grad = Variable(shared_grad_buffers.grads[n+'_grad'])
            optimizer.step()
            counter.reset()
            shared_grad_buffers.reset()
            traffic_light.switch()  # workers start a new loss computation
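
Finally, the traffic_light and counter objects passed to train() and chief() are not defined in this excerpt either; the code above only calls get() and switch() on traffic_light, and get(), increment() and reset() on counter. Below is a minimal sketch built on torch.multiprocessing shared values; the class names and details are assumptions, not the original implementation.

import torch.multiprocessing as mp

# Sketch of the synchronization helpers assumed by train()/chief() above.
class TrafficLight():
    # the chief flips this flag to release workers that are busy-waiting
    def __init__(self):
        self.val = mp.Value('b', False)
        self.lock = mp.Lock()

    def get(self):
        with self.lock:
            return self.val.value

    def switch(self):
        with self.lock:
            self.val.value = not self.val.value

class Counter():
    # counts how many workers have finished their current gradient computation
    def __init__(self):
        self.val = mp.Value('i', 0)
        self.lock = mp.Lock()

    def get(self):
        with self.lock:
            return self.val.value

    def increment(self):
        with self.lock:
            self.val.value += 1

    def reset(self):
        with self.lock:
            self.val.value = 0

With these pieces, the usual launch pattern would be to create a shared Model (moved to shared memory with share_memory()), a Shared_grad_buffers, a Shared_obs_stats and an optimizer over the shared model's parameters, then start one chief process and several train worker processes with torch.multiprocessing.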
