import torch
class SharedAdam(torch.optim.Adam):
    """Adam optimizer whose per-parameter moment buffers live in shared memory.

    Interface is identical to ``torch.optim.Adam``; the only difference is
    that the ``exp_avg`` / ``exp_avg_sq`` state tensors are allocated eagerly
    at construction time and moved into shared memory, so every worker
    process updates the same optimizer statistics.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # Eagerly build the state Adam would normally create lazily on the
        # first step, then push the moment tensors into shared memory.
        for group in self.param_groups:
            for param in group['params']:
                param_state = self.state[param]
                # NOTE(review): the step counter stays a per-process int and
                # is NOT shared — bias correction uses local step counts.
                param_state['step'] = 0
                param_state['exp_avg'] = torch.zeros_like(param.data)
                param_state['exp_avg_sq'] = torch.zeros_like(param.data)
                param_state['exp_avg'].share_memory_()
                param_state['exp_avg_sq'].share_memory_()
from torch import nn
import torch
import numpy as np
def v_wrap(np_array, dtype=np.float32):
    """Convert a numpy array to a torch tensor, casting to *dtype* first.

    The cast is skipped when the array already has the requested dtype, so
    in that case the returned tensor shares memory with the input array.
    """
    converted = np_array if np_array.dtype == dtype else np_array.astype(dtype)
    return torch.from_numpy(converted)
def set_init(layers):
    """Initialise each layer in *layers*: weights ~ N(0, 0.1), biases = 0."""
    for lyr in layers:
        nn.init.normal_(lyr.weight, mean=0., std=0.1)
        nn.init.constant_(lyr.bias, 0.)
def push_and_pull(opt, lnet, gnet, done, s_, bs, ba, br, gamma):
    """Compute the A3C loss on the buffered transitions, push the resulting
    gradients into the *global* net, step the shared optimizer, and pull the
    updated global weights back into the local net.

    Args:
        opt: shared optimizer built over gnet's parameters.
        lnet: local (per-worker) network.
        gnet: global shared network.
        done: True if the episode terminated at s_.
        s_: state following the last buffered transition.
        bs, ba, br: buffered states, actions and rewards for this segment.
        gamma: reward discount factor.
    """
    # Bootstrap value: 0 for a terminal state, otherwise the critic's
    # estimate of s_ (forward(...)[-1] is the value head's output).
    if done:
        v_s_ = 0.
    else:
        v_s_ = lnet.forward(v_wrap(s_[None, :]))[-1].data.numpy()[0, 0]

    # Discounted returns, computed backwards over the reward buffer and
    # then reversed back into chronological order.
    buffer_v_target = []
    for r in br[::-1]:
        v_s_ = r + gamma * v_s_
        buffer_v_target.append(v_s_)
    buffer_v_target.reverse()

    # Discrete actions (int64 scalars) are packed as a flat int64 vector;
    # continuous actions (float arrays) are vstacked instead.
    loss = lnet.loss_func(
        v_wrap(np.vstack(bs)),
        v_wrap(np.array(ba), dtype=np.int64) if ba[0].dtype == np.int64 else v_wrap(np.vstack(ba)),
        v_wrap(np.array(buffer_v_target)[:, None]))

    # Backprop on the LOCAL net, then hand each local gradient tensor to the
    # corresponding global parameter (gp._grad assignment) so the shared
    # optimizer applies the worker's gradients to the global weights.
    opt.zero_grad()
    loss.backward()
    for lp, gp in zip(lnet.parameters(), gnet.parameters()):
        gp._grad = lp.grad
    opt.step()
    # Finally refresh the local net with the newly-updated global weights.
    lnet.load_state_dict(gnet.state_dict())
def record(global_ep, global_ep_r, ep_r, res_queue, name):
    """Book-keeping after one finished episode.

    Atomically bumps the shared episode counter, folds *ep_r* into the
    shared running-average reward (0.99/0.01 exponential average, seeded
    with the first episode's reward), pushes the new average onto
    *res_queue* and logs it.
    """
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        prev = global_ep_r.value
        global_ep_r.value = ep_r if prev == 0. else prev * 0.99 + ep_r * 0.01
    res_queue.put(global_ep_r.value)
    print(name, "Ep:", global_ep.value, "| Ep_r: %.0f" % global_ep_r.value)
import torch
import torch.nn as nn
from utils import v_wrap, set_init, push_and_pull, record
import torch.nn.functional as F
import torch.multiprocessing as mp
from shared_adam import SharedAdam
import gym
import os

# Limit intra-process OpenMP threading — presumably to avoid core
# oversubscription when running one worker process per CPU.
os.environ["OMP_NUM_THREADS"] = "1"

UPDATE_GLOBAL_ITER = 5   # sync local net with the global net every N steps
GAMMA = 0.9              # reward discount factor
MAX_EP = 3000            # total episode budget shared by all workers

# Probe the environment once at import time for state/action dimensions.
env = gym.make('CartPole-v0')
N_S = env.observation_space.shape[0]   # state vector dimension
N_A = env.action_space.n               # number of discrete actions
class Net(nn.Module):
    """Actor-critic network for discrete actions.

    A policy head (pi1 -> pi2) maps the raw state to action logits and a
    separate value head (v1 -> v2) maps the same state to a scalar value
    estimate; the two heads share no hidden layers.
    """

    def __init__(self, s_dim, a_dim):
        super(Net, self).__init__()
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.pi1 = nn.Linear(s_dim, 128)
        self.pi2 = nn.Linear(128, a_dim)
        self.v1 = nn.Linear(s_dim, 128)
        self.v2 = nn.Linear(128, 1)
        set_init([self.pi1, self.pi2, self.v1, self.v2])
        self.distribution = torch.distributions.Categorical

    def forward(self, x):
        """Return (logits, values) for a batch of states *x*."""
        hidden_pi = torch.tanh(self.pi1(x))
        logits = self.pi2(hidden_pi)
        hidden_v = torch.tanh(self.v1(x))
        values = self.v2(hidden_v)
        return logits, values

    def choose_action(self, s):
        """Sample one action from the current policy for state batch *s*."""
        self.eval()
        logits, _ = self.forward(s)
        action_probs = F.softmax(logits, dim=1).data
        return self.distribution(action_probs).sample().numpy()[0]

    def loss_func(self, s, a, v_t):
        """A3C loss: squared TD error (critic) plus policy-gradient term (actor)."""
        self.train()
        logits, values = self.forward(s)
        advantage = v_t - values
        critic_loss = advantage.pow(2)
        dist = self.distribution(F.softmax(logits, dim=1))
        # Advantage is detached so the actor term does not backprop into the critic.
        actor_loss = -(dist.log_prob(a) * advantage.detach().squeeze())
        return (critic_loss + actor_loss).mean()
'''
# continuous-action version
class Net(nn.Module):
    def __init__(self, s_dim, a_dim):
        super(Net, self).__init__()
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.a1 = nn.Linear(s_dim, 200)
        self.mu = nn.Linear(200, a_dim)
        self.sigma = nn.Linear(200, a_dim)
        self.c1 = nn.Linear(s_dim, 100)
        self.v = nn.Linear(100, 1)
        set_init([self.a1, self.mu, self.sigma, self.c1, self.v])
        self.distribution = torch.distributions.Normal # normal distribution
    def forward(self, x):
        a1 = F.relu6(self.a1(x))
        mu = 2 * F.tanh(self.mu(a1)) # mean
        sigma = F.softplus(self.sigma(a1)) + 0.001 # avoid 0; std deviation
        c1 = F.relu6(self.c1(x))
        values = self.v(c1) # state value
        return mu, sigma, values
    def choose_action(self, s):
        self.training = False
        mu, sigma, _ = self.forward(s)
        m = self.distribution(mu.view(1, ).data, sigma.view(1, ).data) # sample a random action from the normal distribution
        return m.sample().numpy()
    def loss_func(self, s, a, v_t):
        self.train()
        mu, sigma, values = self.forward(s)
        td = v_t - values
        c_loss = td.pow(2)
        m = self.distribution(mu, sigma)
        log_prob = m.log_prob(a)
        entropy = 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(m.scale) # exploration entropy term
        exp_v = log_prob * td.detach() + 0.005 * entropy
        a_loss = -exp_v
        total_loss = (a_loss + c_loss).mean()
        return total_loss
'''
class Worker(mp.Process):
    """One A3C worker process.

    Owns a private environment and a local copy of the network; every few
    steps (or at episode end) it pushes gradients to the shared global net
    and pulls back the fresh global weights.
    """

    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        super(Worker, self).__init__()
        self.name = 'w%02i' % name   # also overrides Process.name
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        self.lnet = Net(N_S, N_A)    # local network copy
        self.env = gym.make('CartPole-v0').unwrapped

    def run(self):
        step_count = 1
        while self.g_ep.value < MAX_EP:
            state = self.env.reset()
            buf_s, buf_a, buf_r = [], [], []
            episode_reward = 0.
            while True:
                if self.name == 'w00':
                    self.env.render()   # only the first worker renders
                action = self.lnet.choose_action(v_wrap(state[None, :]))
                next_state, reward, done, _ = self.env.step(action)
                '''
                s_, r, done, _ = self.env.step(a.clip(-2, 2)) # 连续动作
                '''
                if done:
                    reward = -1         # penalise reaching a terminal state
                episode_reward += reward
                buf_a.append(action)
                buf_s.append(state)
                buf_r.append(reward)
                # Periodically (or at episode end) sync with the global net.
                if step_count % UPDATE_GLOBAL_ITER == 0 or done:
                    push_and_pull(self.opt, self.lnet, self.gnet, done,
                                  next_state, buf_s, buf_a, buf_r, GAMMA)
                    buf_s, buf_a, buf_r = [], [], []
                    if done:
                        record(self.g_ep, self.g_ep_r, episode_reward,
                               self.res_queue, self.name)
                        break
                state = next_state
                step_count += 1
        self.res_queue.put(None)        # sentinel: this worker has finished
if __name__ == "__main__":
    # Global shared network + shared optimizer: both live in shared memory
    # so every worker process reads and updates the same weights/state.
    gnet = Net(N_S, N_A)
    gnet.share_memory()
    opt = SharedAdam(gnet.parameters(), lr=1e-4, betas=(0.92, 0.999))
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    # One worker per CPU core.
    workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i)
               for i in range(mp.cpu_count())]
    # Plain loops, not side-effect list comprehensions, to start the workers.
    for w in workers:
        w.start()

    # Collect the moving-average episode rewards until the first worker
    # posts its None sentinel, then wait for all workers to exit.
    res = []
    while True:
        r = res_queue.get()
        if r is None:
            break
        res.append(r)
    for w in workers:
        w.join()

    import matplotlib.pyplot as plt
    plt.plot(res)
    plt.ylabel('Moving average ep reward')
    plt.xlabel('Step')
    plt.show()