Original article: tf actor critic Pendulum-v0 (pendulum)
Previous post: gym Pendulum-v0
If the learning rate is too large, or the actor and critic learning rates differ too much, the trained policy oscillates: it can perform well at some point, then swing back to poor behavior, and the network has to re-converge.
Setting a smaller learning rate and training longer resolves the oscillation.
The actor's learning rate should be smaller than the critic's, since the actor's update is guided by the critic's TD error and benefits from a more stable value estimate.
The actor has to fit a normal distribution from which actions are sampled, as the sketch below shows.
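A minimal sketch of such a Gaussian policy head, assuming the Pendulum state and torque bounds (the Actor class further down builds the same thing): the mean comes from a tanh layer scaled to the torque range, and the standard deviation from a softplus layer with a small floor.

import tensorflow as tf
from tensorflow.contrib import slim
from tensorflow_probability import distributions as tfd

s = tf.placeholder(tf.float32, [1, 3], "state")  # Pendulum observation: cos(theta), sin(theta), theta_dot
net = slim.fully_connected(s, 32, tf.nn.leaky_relu)
mu = 2 * slim.fully_connected(net, 1, tf.nn.tanh)           # mean in [-2, 2], the torque range
sigma = slim.fully_connected(net, 1, tf.nn.softplus) + 0.1  # std floored at 0.1 so exploration never dies out
pi = tfd.Normal(tf.squeeze(mu), tf.squeeze(sigma))
action = tf.clip_by_value(pi.sample(1), -2., 2.)            # clip the sample to the action bounds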
Reward distribution
Reward distribution under random actions, over 200 steps.
The closer the reward is to 0, the more upright the pole is.
import gym
import time
import matplotlib.pyplot as plt

name = "Pendulum-v0"
# name = "CartPole-v0"
env = gym.make(name)
env.reset()
action = env.action_space.sample()
print(env.action_space)       # Box(1,): torque in [-2, 2]
print(env.observation_space)  # Box(3,): cos(theta), sin(theta), theta_dot
observation, reward, done, info = env.step(action)
print(observation, reward, done, info)

r_list = []
for _ in range(200):
    action = env.action_space.sample()  # random torque
    observation, reward, done, info = env.step(action)
    reward /= 16.5  # normalize reward to roughly (-1, 0]
    r_list.append(reward)
    env.render()
    time.sleep(.01)
plt.hist(r_list, 100, range=(-1, 0))  # histogram of normalized rewards
plt.show()
env.close()
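Why divide by 16.5: Pendulum-v0's per-step reward is -(theta^2 + 0.1 * theta_dot^2 + 0.001 * action^2), with the angle normalized to [-pi, pi], the angular velocity clipped to [-8, 8], and the torque bounded by [-2, 2], so the worst single-step reward is about -16.27. Dividing by 16.5 therefore maps all rewards into roughly (-1, 0]. A quick sanity check:

import math

# worst case: theta = pi, theta_dot = 8, action = 2
worst = -(math.pi ** 2 + 0.1 * 8 ** 2 + 0.001 * 2 ** 2)
print(worst)         # ~ -16.2736
print(worst / 16.5)  # ~ -0.9863, safely inside (-1, 0]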
Training
A good result: reward values concentrated near 0.
The fraction of rewards greater than -0.3 rises steadily as training progresses.
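For reference, the code below implements standard one-step actor-critic. Each step computes the TD error from the critic and uses it as the advantage for the actor (0.01 is the entropy weight used in the code):

td_error = r + GAMMA * V(s') - V(s)                  # the critic minimizes td_error**2
objective = log_prob(a) * td_error + 0.01 * entropy  # the actor maximizes this (minimizes its negative)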
import tensorflow as tf
import numpy as np
import gym
from tensorflow.contrib import slim
import matplotlib.pyplot as plt
from tensorflow_probability import distributions as tfd
class Actor(object):
    def __init__(self, sess, n_features, action_bound, lr=0.001):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.float32, (1,), name="act")
        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD error from the critic

        net = slim.fully_connected(self.s, 32, tf.nn.leaky_relu)
        mu = slim.fully_connected(net, 1, tf.nn.tanh)         # mean head, in [-1, 1]
        sigma = slim.fully_connected(net, 1, tf.nn.softplus)  # std head, > 0

        global_step = tf.Variable(0, trainable=False)
        # Scale the mean to the torque range [-2, 2]; floor the std at 0.1
        # so exploration never collapses.
        self.mu, self.sigma = tf.squeeze(mu * 2), tf.squeeze(sigma + 0.1)
        self.normal_dist = tfd.Normal(self.mu, self.sigma)
        # Sample one action and clip it to the valid action bounds.
        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])

        with tf.name_scope('exp_v'):
            log_prob = self.normal_dist.log_prob(self.a)  # log pi(a|s)
            self.exp_v = log_prob * self.td_error         # advantage (TD error) guided loss
            # Add an entropy bonus to encourage exploration.
            self.exp_v += 0.01 * self.normal_dist.entropy()
        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)  # min(-v) = max(v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        return self.sess.run(self.action, {self.s: s})  # sample one action from the current policy
class Critic(object):
    def __init__(self, sess, n_features, lr=0.002):
        self.sess = sess
        with tf.name_scope('inputs'):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
            self.r = tf.placeholder(tf.float32, name='r')
        with tf.variable_scope('Critic'):
            net = slim.fully_connected(self.s, 32, tf.nn.leaky_relu)
            self.v = slim.fully_connected(net, 1, tf.nn.leaky_relu)  # state value V(s)
        with tf.variable_scope('squared_TD_error'):
            # TD_error = (r + gamma * V_next) - V_eval
            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
            self.loss = tf.square(self.td_error)
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        # Evaluate V(s') with the current network, then update V(s) toward the TD target.
        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error
OUTPUT_GRAPH = False
MAX_EPISODE = 1000
show_step = 50  # render and plot progress every show_step episodes
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = -100  # render the environment if the episode reward is greater than this threshold (unused below)
RENDER = False  # rendering wastes time (unused below)
GAMMA = 0.9
LR_A = 0.001  # learning rate for the actor (smaller than the critic's, as noted above)
LR_C = 0.002  # learning rate for the critic
env = gym.make('Pendulum-v0')
env = env.unwrapped
N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high
def main():
    sess = tf.Session()
    actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
    critic = Critic(sess, n_features=N_S, lr=LR_C)
    sess.run(tf.global_variables_initializer())
    if OUTPUT_GRAPH:
        tf.summary.FileWriter("logs/", sess.graph)

    sum_r_list = []
    rate_list = []  # fraction of rewards greater than -0.3; higher means a better policy
    for i_episode in range(1, 1 + MAX_EPISODE):
        s = env.reset()
        t = 0
        r_list = []
        while True:
            if not i_episode % show_step:
                env.render()
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            r /= 16.5  # normalize reward to roughly (-1, 0]
            td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
            actor.learn(s, a, td_error)        # true_gradient = grad[log pi(s, a) * td_error]
            s = s_
            t += 1
            r_list.append(r)
            if t > MAX_EP_STEPS:
                print('episode', i_episode, sum(r_list), min(r_list), max(r_list))
                sum_r_list.append(sum(r_list))
                rate_list.append(np.sum(np.array(r_list) > -0.3) / MAX_EP_STEPS)
                break
        if not i_episode % show_step:
            plt.plot(sum_r_list)  # total normalized reward per episode
            plt.show()
            plt.hist(r_list, 100, range=(-1, 0))  # reward distribution of the latest episode
            plt.show()
            plt.plot(rate_list)  # fraction of rewards above -0.3 over episodes
            plt.show()
if __name__ == '__main__':
main()