tf actor critic Pendulum-v0 (pendulum)

Previous post: gym Pendulum-v0

Next post: tf a3c Pendulum-v0 (pendulum)

If the learning rate is too large, or the actor and critic learning rates differ too much, the policy fluctuates after training: it may perform very well at some point, then oscillate, become much worse, and the network has to converge all over again.

Using a smaller learning rate and training a bit longer resolves the fluctuation.

The actor's learning rate should be smaller than the critic's.

The actor fits a normal distribution and samples the action from it; see the sketch below.
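
A minimal NumPy sketch of that idea, kept separate from the TensorFlow graph built later in this post; the weights and the 3-dimensional observation here are made up purely for illustration:

import numpy as np

def gaussian_policy_step(state_features, w_mu, w_sigma, action_bound=2.0):
    # map features to the distribution parameters, then sample and clip
    mu = 2.0 * np.tanh(state_features @ w_mu)                  # mean scaled into [-2, 2]
    sigma = np.log1p(np.exp(state_features @ w_sigma)) + 0.1   # softplus + 0.1 keeps sigma > 0
    action = np.random.normal(mu, sigma)                       # sample from N(mu, sigma)
    return np.clip(action, -action_bound, action_bound)

s = np.array([0.9, 0.1, 0.5])                                  # hypothetical Pendulum observation
print(gaussian_policy_step(s, np.random.randn(3), np.random.randn(3)))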

Reward distribution

Reward distribution of random actions over 200 steps.

The closer the reward is to 0, the more upright the pole.

[Figure: fd39b8bb0467a284200790e8146d227c0b8.jpg]

import gym
import time
import matplotlib.pyplot as plt

name = "Pendulum-v0"
# name = "CartPole-v0"
env = gym.make(name)
env.reset()
action = env.action_space.sample()
print(env.action_space)
print(env.observation_space)
observation, reward, done, info = env.step(action)
print(observation, reward, done, info)
r_list = []
for _ in range(200):
    action = env.action_space.sample()
    print('action', action.shape)
    observation, reward, done, info = env.step(action)
    reward /= 16.5  # normalize reward to roughly [-1, 0] (worst-case reward is about -16.27)
    r_list.append(reward)
    print(reward)
    env.render()
    time.sleep(.01)

plt.hist(r_list, 100, range=(-1, 0))
plt.show()
env.close()
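
Why divide the reward by 16.5: Pendulum-v0's reward is -(theta^2 + 0.1 * theta_dot^2 + 0.001 * torque^2), and with theta in [-pi, pi], theta_dot in [-8, 8] and torque in [-2, 2] the worst single step is about -16.27, so dividing by 16.5 squeezes the reward roughly into [-1, 0]. A quick check (16.5 is just a convenient round bound, not a constant taken from the env source):

import numpy as np

# worst case: theta = pi, theta_dot = 8, torque = 2
worst_cost = np.pi ** 2 + 0.1 * 8 ** 2 + 0.001 * 2 ** 2
print(worst_cost)            # ~16.27
print(-worst_cost / 16.5)    # ~-0.99, so reward / 16.5 stays roughly within [-1, 0]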

Training

[Figure: 2d4424a2fb3aefc02e6a40faaadb19d4b28.jpg]

A fairly good result: the rewards are concentrated near 0.

[Figure: e5f5d50a886c0036d7d6d2996721cb03ac5.jpg]

The fraction of rewards greater than -0.3, which can be seen rising gradually over training.

[Figure: de0a2319a9d11e08fd5afcf9a524d3eeb5f.jpg]

import tensorflow as tf
import numpy as np
import gym
from tensorflow.contrib import slim
import matplotlib.pyplot as plt
from tensorflow_probability import distributions as tfd


class Actor(object):
    def __init__(self, sess, n_features, action_bound, lr=0.001):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.float32, (1,), name="act")
        self.td_error = tf.placeholder(tf.float32, None, name="td_error")  # TD_error

        net = slim.fully_connected(
            self.s,
            32,
            tf.nn.leaky_relu
        )
        mu = slim.fully_connected(
            net, 1, tf.nn.tanh
        )
        sigma = slim.fully_connected(
            net, 1, tf.nn.softplus
        )
        global_step = tf.Variable(0, trainable=False)
        # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
        self.mu, self.sigma = tf.squeeze(mu * 2), tf.squeeze(sigma + 0.1)  # scale mean to [-2, 2]; keep sigma above 0.1
        print('mu,sigma', self.mu.shape, self.sigma.shape)
        # self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
        self.normal_dist = tfd.Normal(self.mu, self.sigma)
        print('normal_dist', self.normal_dist)
        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])
        print('action', self.action.shape)

        with tf.name_scope('exp_v'):
            log_prob = self.normal_dist.log_prob(self.a)  # loss without advantage
            self.exp_v = log_prob * self.td_error  # advantage (TD_error) guided loss
            # Add an entropy bonus to encourage exploration
            self.exp_v += 0.01 * self.normal_dist.entropy()

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)  # min(v) = max(-v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        return self.sess.run(self.action, {self.s: s})  # sample an action from the current policy


class Critic(object):
    def __init__(self, sess, n_features, lr=0.002):
        self.sess = sess
        with tf.name_scope('inputs'):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
            self.r = tf.placeholder(tf.float32, name='r')

        with tf.variable_scope('Critic'):
            net = slim.fully_connected(
                self.s, 32, tf.nn.leaky_relu
            )
            self.v = slim.fully_connected(
                net, 1, tf.nn.leaky_relu
            )

        with tf.variable_scope('squared_TD_error'):
            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
            self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error


OUTPUT_GRAPH = False
MAX_EPISODE = 1000
show_step = 50
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = -100  # render threshold for total episode reward (not used in this script)
RENDER = False  # rendering wastes time (not used in this script)
GAMMA = 0.9
LR_A = 0.001  # learning rate for actor
LR_C = 0.002  # learning rate for critic

env = gym.make('Pendulum-v0')
env = env.unwrapped

N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high


def main():
    sess = tf.Session()

    actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
    critic = Critic(sess, n_features=N_S, lr=LR_C)

    sess.run(tf.global_variables_initializer())

    if OUTPUT_GRAPH:
        tf.summary.FileWriter("logs/", sess.graph)
    sum_r_list = []
    rate_list = []  # fraction of rewards greater than -0.3; the higher, the better the model
    for i_episode in range(1, 1 + MAX_EPISODE):
        s = env.reset()
        t = 0
        r_list = []
        while True:
            if not i_episode % show_step:
                env.render()
            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)
            r /= 16.5  # normalize reward to roughly [-1, 0]

            td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
            actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]
            s = s_
            t += 1
            r_list.append(r)
            if t > MAX_EP_STEPS:
                print('episode', i_episode, sum(r_list), min(r_list), max(r_list))
                sum_r_list.append(sum(r_list))
                rate_list.append(np.sum(np.array(r_list) > -0.3) / MAX_EP_STEPS)
                break
        if not i_episode % show_step:
            plt.plot(sum_r_list)
            plt.show()
            plt.hist(r_list, 100, range=(-1, 0))
            plt.show()
            plt.plot(rate_list)
            plt.show()


if __name__ == '__main__':
    main()
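
For reference, the two quantities the networks optimize each step (the critic's TD error and the actor's exp_v objective), written out in plain NumPy with made-up numbers; this is only a sketch of the arithmetic behind the comments above, not part of the training script:

import numpy as np

GAMMA = 0.9
r, v_s, v_s_next = -0.2, -3.1, -2.8             # reward and critic estimates (made-up values)
td_error = r + GAMMA * v_s_next - v_s           # critic minimizes td_error ** 2

mu, sigma, a = 0.5, 0.3, 0.4                    # actor outputs and the action that was sampled
log_prob = -0.5 * ((a - mu) / sigma) ** 2 - np.log(sigma * np.sqrt(2 * np.pi))
entropy = 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)
exp_v = log_prob * td_error + 0.01 * entropy    # actor maximizes this (Adam minimizes -exp_v)
print(td_error, exp_v)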
