Policy-Based Reinforcement Learning: Policy Gradient on CartPole

The code is transcribed from the book 《TensorFlow实战》 (TensorFlow in Action) so that readers can run, test, and learn from it.

#coding:utf-8
#!pip install gym
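# Note: the code below assumes the classic gym API, where env.reset() returns only
# the observation and env.step() returns (observation, reward, done, info); newer
# gym releases changed both signatures, so an older gym version may be needed.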

import numpy as np
import tensorflow as tf
import gym

# Create the environment env for the CartPole problem
env = gym.make('CartPole-v0')

# First, measure how random actions perform in the CartPole environment, as a
# baseline to compare against later.
env.reset()  # initialize the environment
random_episodes = 0
reward_sum = 0
while random_episodes < 10:
    env.render()  # render the CartPole environment
    observation, reward, done, _ = env.step(np.random.randint(0, 2))
    reward_sum += reward
    if done:
        random_episodes += 1
        print("Reward for this episode was:", reward_sum)
        reward_sum = 0
        env.reset()

# The policy network is a simple MLP with a single hidden layer.
H = 50  # number of hidden-layer units
batch_size = 25
learning_rate = 1e-1
D = 4  # the observation vector has 4 dimensions
gamma = 0.99  # discount factor for rewards

xs, ys, drs = [], [], []
reward_sum = 0
episode_number = 1
total_episodes = 10000

observations = tf.placeholder(tf.float32, [None, D], name='input_x')
w1 = tf.get_variable('w1', shape=[D, H], initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, w1))
w2 = tf.get_variable('w2', shape=[H, 1], initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, w2)
probability = tf.nn.sigmoid(score)  # probability of choosing action 1

# Define a placeholder for the manually constructed virtual label, input_y, and a
# placeholder for the estimated potential value of each action, advantages.

input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
advantages = tf.placeholder(tf.float32, name="reward_signal")
loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
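# Because y = 1 - action (set in the training loop below), loglik reduces to
# log(1 - probability) when action 0 was taken and to log(probability) when
# action 1 was taken, i.e. the log-probability of the action actually chosen.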
loss = -tf.reduce_mean(loglik * advantages)
tvars = tf.trainable_variables()
newGrads = tf.gradients(loss, tvars)

# The optimizer is Adam. We create placeholders for the gradients of the two network
# layers, W1Grad and W2Grad, and use adam.apply_gradients to define updateGrads, the
# op that updates the model parameters. Gradients are then computed per episode and
# accumulated; once gradients for enough samples have been collected, they are fed
# into W1Grad and W2Grad and updateGrads is executed. Rather than updating the
# parameters after every single sample, we accumulate the gradients of a full
# batch_size of samples before updating, so the random noise of any single sample
# does not harm the model.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
W1Grad = tf.placeholder(tf.float32, name='batch_grad1')
W2Grad = tf.placeholder(tf.float32, name='batch_grad2')
batchGrad = [W1Grad, W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad, tvars))

# discount_rewards estimates the potential (discounted future) value of each action.
def discount_rewards(r):
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
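
# For example, with gamma = 0.99 an episode with rewards [1., 1., 1.] is mapped to
# [2.9701, 1.99, 1.], so earlier actions are also credited with the discounted
# rewards that follow them.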


with tf.Session() as sess:
    rendering = False
    init = tf.global_variables_initializer()
    sess.run(init)
    observation = env.reset()

    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    while episode_number <= total_episodes:
        # start rendering once the average reward over a batch exceeds 100
        if reward_sum / batch_size > 100 or rendering == True:
            env.render()
            rendering = True
        x = np.reshape(observation, [1, D])

        tfprob = sess.run(probability, feed_dict={observations: x})
        action = 1 if np.random.uniform() < tfprob else 0  # sample action 1 with probability tfprob
        xs.append(x)
        y = 1 - action  # virtual label: 1 if action 0 was taken, 0 if action 1 was taken
        ys.append(y)

        observation, reward, done, info = env.step(action)
        reward_sum += reward
        drs.append(reward)
        if done:
            episode_number += 1
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            xs, ys, drs = [], [], []

            discounted_epr = discount_rewards(epr)
            # standardize the discounted rewards to zero mean and unit variance
            # before using them as advantages
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy,
                                                  advantages: discounted_epr})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})

                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

                print('Average reward for episode %d: %f.' % (episode_number, reward_sum / batch_size))

                if reward_sum / batch_size > 200:
                    print('Task solved in', episode_number, 'episodes!')
                    break

                reward_sum = 0
            observation = env.reset()
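
The script above targets TensorFlow 1.x (tf.placeholder, tf.Session, tf.contrib). If only TensorFlow 2.x is installed, a rough workaround, offered here as an assumption rather than something from the book, is to go through the v1 compatibility module and swap the contrib initializer for the equivalent built-in Glorot (Xavier) one:

import tensorflow.compat.v1 as tf  # assumption: TensorFlow 2.x with the v1 compat module
tf.disable_v2_behavior()           # restore graph-mode placeholders and sessions
# then replace tf.contrib.layers.xavier_initializer() with tf.glorot_uniform_initializer()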
