import math
import random
import time

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
# PPO actor-critic model
class Model(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Model, self).__init__()
        h_size_1 = 100
        h_size_2 = 100
        # critic network
        self.v_fc1 = nn.Linear(num_inputs, h_size_1*5)
        self.v_fc2 = nn.Linear(h_size_1*5, h_size_2)
        self.v = nn.Linear(h_size_2, 1)
        # actor network: state-dependent mean, state-independent log std
        self.p_fc1 = nn.Linear(num_inputs, h_size_1)
        self.p_fc2 = nn.Linear(h_size_1, h_size_2)
        self.mu = nn.Linear(h_size_2, num_outputs)
        self.log_std = nn.Parameter(torch.zeros(1, num_outputs))
        # zero-initialize all biases
        for name, p in self.named_parameters():
            if 'bias' in name:
                p.data.fill_(0)
        self.train()

    def forward(self, inputs):
        # actor head: mean and variance of a Gaussian policy
        x = F.tanh(self.p_fc1(inputs))
        x = F.tanh(self.p_fc2(x))
        mu = self.mu(x)
        sigma_sq = torch.exp(self.log_std)
        # critic head: state value
        x = F.tanh(self.v_fc1(inputs))
        x = F.tanh(self.v_fc2(x))
        v = self.v(x)
        return mu, sigma_sq, v
# Gradient buffers shared between worker processes and the chief
class Shared_grad_buffers():
    def __init__(self, model):
        self.grads = {}
        for name, p in model.named_parameters():
            # start at zero so the first accumulated update is unbiased
            self.grads[name+'_grad'] = torch.zeros(p.size()).share_memory_()

    def add_gradient(self, model):
        # accumulate this worker's gradients into the shared buffers
        for name, p in model.named_parameters():
            self.grads[name+'_grad'] += p.grad.data

    def reset(self):
        for name, grad in self.grads.items():
            self.grads[name].fill_(0)
# Running statistics for online observation normalization (shared across workers)
class Shared_obs_stats():
    def __init__(self, num_inputs):
        self.n = torch.zeros(num_inputs).share_memory_()
        self.mean = torch.zeros(num_inputs).share_memory_()
        self.mean_diff = torch.zeros(num_inputs).share_memory_()
        self.var = torch.zeros(num_inputs).share_memory_()

    def observes(self, obs):
        # Welford-style running update of the observation mean and variance
        x = obs.data.squeeze()
        self.n += 1
        last_mean = self.mean.clone()
        self.mean += (x-self.mean)/self.n
        self.mean_diff += (x-last_mean)*(x-self.mean)
        # update in place so the tensor stays in shared memory
        self.var.copy_(torch.clamp(self.mean_diff/self.n, min=1e-2))

    def normalize(self, inputs):
        obs_mean = Variable(self.mean.unsqueeze(0).expand_as(inputs))
        obs_std = Variable(torch.sqrt(self.var).unsqueeze(0).expand_as(inputs))
        return torch.clamp((inputs-obs_mean)/obs_std, -5., 5.)
# Experience replay buffer for PPO mini-batch updates
class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, events):
        for event in zip(*events):
            self.memory.append(event)
            if len(self.memory) > self.capacity:
                del self.memory[0]

    def clear(self):
        self.memory = []

    def sample(self, batch_size):
        samples = zip(*random.sample(self.memory, batch_size))
        return map(lambda x: torch.cat(x, 0), samples)
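
# The training loop below calls a `normal` helper that this listing never
# defines. A minimal sketch of the missing Gaussian density, assuming (as
# Model.forward suggests) that sigma_sq holds the variance of the policy:
def normal(x, mu, sigma_sq):
    # elementwise density of N(mu, sigma_sq) at x;
    # x is detached so gradients flow through mu and sigma_sq only
    var = sigma_sq.expand_as(mu)
    a = (-1*(x.detach()-mu).pow(2)/(2*var)).exp()
    b = 1/torch.sqrt(2*math.pi*var)
    return a*b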
# training worker: collects rollouts and computes PPO gradients
def train(params, traffic_light, counter, shared_model, shared_grad_buffers, shared_obs_stats):
    torch.manual_seed(params.seed)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    model = Model(num_inputs, num_outputs)
    memory = ReplayMemory(params.exploration_size)
    state = env.reset()
    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        # per-iteration rollout storage
        states, actions, rewards, values = [], [], [], []
        returns, advantages = [], []
        av_reward = 0
        cum_reward = 0
        cum_done = 0
        # Perform K steps
        for step in range(params.num_steps):
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            states.append(state)
            mu, sigma_sq, v = model(state)
            # sample an action from the Gaussian policy
            eps = torch.randn(mu.size())
            action = (mu + sigma_sq.sqrt()*Variable(eps))
            actions.append(action)
            values.append(v)
            env_action = action.data.squeeze().numpy()
            state, reward, done, _ = env.step(env_action)
            done = (done or episode_length >= params.max_episode_length)
            cum_reward += reward
            # clip rewards to [-1, 1]
            reward = max(min(reward, 1), -1)
            rewards.append(reward)
            if done:
                cum_done += 1
                av_reward += cum_reward
                cum_reward = 0
                episode_length = 0
                state = env.reset()
            # re-wrap the (possibly reset) observation for the next step
            state = Variable(torch.Tensor(state).unsqueeze(0))
            if done:
                break
        # bootstrap value for a truncated (non-terminal) rollout
        R = torch.zeros(1, 1)
        if not done:
            _, _, v = model(state)
            R = v.data
        values.append(Variable(R))
        # compute GAE(lambda) advantages and returns, backwards in time
        A = Variable(torch.zeros(1, 1))
        for i in reversed(range(len(rewards))):
            td = rewards[i] + params.gamma*values[i+1].data[0, 0] - values[i].data[0, 0]
            A = float(td) + params.gamma*params.gae_param*A
            advantages.insert(0, A)
            R = A + values[i]
            returns.insert(0, R)
        # store useful info
        memory.push([states, actions, returns, advantages])
        av_reward /= float(cum_done + 1)
        # snapshot the behavior policy before the PPO epochs
        model_old = Model(num_inputs, num_outputs)
        model_old.load_state_dict(model.state_dict())
        for k in range(params.num_epoch):
            # sync with the latest shared weights
            model.load_state_dict(shared_model.state_dict())
            model.zero_grad()
            # get initial signal
            signal_init = traffic_light.get()
            # sample a new mini-batch
            batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(params.batch_size)
            # action probabilities under the old (behavior) policy
            mu_old, sigma_sq_old, v_pred_old = model_old(batch_states.detach())
            probs_old = normal(batch_actions, mu_old, sigma_sq_old)
            # action probabilities under the current policy
            mu, sigma_sq, v_pred = model(batch_states)
            probs = normal(batch_actions, mu, sigma_sq)
            # clipped surrogate objective
            ratio = probs/(1e-5 + probs_old)
            surr1 = ratio * torch.cat([batch_advantages]*num_outputs, 1)
            surr2 = ratio.clamp(1-params.clip, 1+params.clip) * torch.cat([batch_advantages]*num_outputs, 1)
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            # clipped value loss
            vfloss1 = (v_pred - batch_returns)**2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(-params.clip, params.clip)
            vfloss2 = (v_pred_clipped - batch_returns)**2
            loss_value = 0.5*torch.mean(torch.max(vfloss1, vfloss2))
            # entropy bonus
            loss_ent = -params.ent_coeff*torch.mean(probs*torch.log(probs+1e-5))
            total_loss = (loss_clip + loss_value + loss_ent)
            total_loss.backward(retain_graph=True)
            shared_grad_buffers.add_gradient(model)
            counter.increment()
            # wait for the chief to apply the update before the next epoch
            while traffic_light.get() == signal_init:
                pass
        # drop stale transitions gathered under the old policy
        memory.clear()
# chief process: waits for enough worker gradients, then updates the shared model
def chief(params, traffic_light, counter, shared_model, shared_grad_buffers, optimizer):
    while True:
        time.sleep(1)
        # workers will wait after their last loss computation
        if counter.get() > params.update_treshold:
            # copy the accumulated worker gradients into the shared model
            for n, p in shared_model.named_parameters():
                p._grad = Variable(shared_grad_buffers.grads[n+'_grad'])
            optimizer.step()
            counter.reset()
            shared_grad_buffers.reset()
            traffic_light.switch()  # workers start a new loss computation
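
# `traffic_light` and `counter` are used above but never defined in this
# listing. A minimal sketch of the two synchronization primitives, assuming a
# torch.multiprocessing setup; the originals in the full project may differ:
import torch.multiprocessing as mp

class TrafficLight:
    # shared flag that the chief flips to release workers waiting on it
    def __init__(self):
        self.val = mp.Value('b', False)
        self.lock = mp.Lock()

    def get(self):
        with self.lock:
            return self.val.value

    def switch(self):
        with self.lock:
            self.val.value = not self.val.value

class Counter:
    # shared count of workers that have finished a gradient computation
    def __init__(self):
        self.val = mp.Value('i', 0)
        self.lock = mp.Lock()

    def get(self):
        with self.lock:
            return self.val.value

    def increment(self):
        with self.lock:
            self.val.value += 1

    def reset(self):
        with self.lock:
            self.val.value = 0

# A hypothetical launcher showing how the pieces could be wired together.
# `Params` and its fields (seed, env_name, lr, num_processes, update_treshold,
# ...) are assumptions standing in for the project's real configuration object:
if __name__ == '__main__':
    import torch.optim as optim

    params = Params()  # hypothetical config object, defined elsewhere
    torch.manual_seed(params.seed)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]

    shared_model = Model(num_inputs, num_outputs)
    shared_model.share_memory()
    shared_grad_buffers = Shared_grad_buffers(shared_model)
    shared_obs_stats = Shared_obs_stats(num_inputs)
    optimizer = optim.Adam(shared_model.parameters(), lr=params.lr)
    traffic_light = TrafficLight()
    counter = Counter()

    # one chief process plus several training workers
    processes = []
    p = mp.Process(target=chief, args=(params, traffic_light, counter,
                                       shared_model, shared_grad_buffers, optimizer))
    p.start()
    processes.append(p)
    for rank in range(params.num_processes):
        p = mp.Process(target=train, args=(params, traffic_light, counter,
                                           shared_model, shared_grad_buffers, shared_obs_stats))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()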