current status : [ 0.03557839 0.01374288 0.02786276 -0.01108547]
current status : [ 0.03585325 0.20845439 0.02764105 -0.29484879]
current status : [ 0.04002234 0.40317159 0.02174408 -0.57868758]
...
current status : [-0.10733809 -0.94421495 0.19031346 1.08084174]
(52 steps of output, truncated here; each "status" is the CartPole observation: [cart position, cart velocity, pole angle, pole angular velocity])
In supervised learning, a neural network can optimize its decisions directly against labels, for example with a cross-entropy loss. In RL, however, the only signal available to guide decisions is the reward, and rewards are usually heavily delayed: the reward for a decision may only arrive long after the decision was made. The common workaround is to score an action by the rewards that follow it, applying a decay (discount) factor r to later rewards. If r = 0, this reduces to using only the current step's reward; the closer r is to 1, the more weight distant rewards receive.
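A minimal sketch of this discounting in Python (the helper name discount_rewards and the example rewards are illustrative, not from the notebook):

import numpy as np

def discount_rewards(rewards, discount_rate):
    # Each step is credited with its own reward plus the discounted sum of all later rewards
    discounted = np.empty(len(rewards))
    cumulative = 0.0
    for step in reversed(range(len(rewards))):
        cumulative = rewards[step] + cumulative * discount_rate
        discounted[step] = cumulative
    return discounted

# With discount_rate = 0 only the current step's reward is kept;
# values near 1 give future rewards almost full weight.
print(discount_rewards([10, 0, -50], discount_rate=0.8))  # [-22. -40. -50.]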
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# 1. Specify the network architecture
n_inputs = 4   # == env.observation_space.shape[0]
n_hidden = 4   # it's a simple task, we don't need more than this
n_outputs = 1  # only outputs the probability of accelerating left
initializer = tf.contrib.layers.variance_scaling_initializer()

# 2. Build the neural network
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu,
                         kernel_initializer=initializer)
outputs = tf.layers.dense(hidden, n_outputs, activation=tf.nn.sigmoid,
                          kernel_initializer=initializer)

# 3. Select a random action based on the estimated probabilities
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

init = tf.global_variables_initializer()
env = gym.make("CartPole-v0")

n_max_steps = 1000
frames = []
angles = []

with tf.Session() as sess:
    init.run()
    obs = env.reset()
    for step in range(n_max_steps):
        img = render_cart_pole(env, obs)  # rendering helper defined earlier in the notebook
        frames.append(img)
        # Sample an action from the policy network's output probabilities
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0][0])
        angles.append(obs[2])  # track the pole angle
        if done:
            break
env.close()
plt.plot(angles)

(Plot: the pole angle at each step of the episode.)
In the run above, no training signal of any kind is used to improve the policy, so the agent eventually becomes unstable.
reset_graph()
n_inputs = 4
n_hidden = 4
n_outputs = 1
learning_rate = 0.01
initializer = tf.contrib.layers.variance_scaling_initializer()
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
y = tf.placeholder(tf.float32, shape=[None, n_outputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(cross_entropy)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_environments = 10
n_iterations = 1000
envs = [gym.make("CartPole-v0") for _ in range(n_environments)]
observations = [env.reset() for env in envs]
angles = np.zeros((n_iterations, n_environments))

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        # if angle < 0 we want proba(left) = 1., otherwise proba(left) = 0.
        target_probas = np.array([([1.] if obs[2] < 0 else [0.]) for obs in observations])
        action_val, _ = sess.run([action, training_op],
                                 feed_dict={X: np.array(observations), y: target_probas})
        for env_index, env in enumerate(envs):
            obs, reward, done, info = env.step(action_val[env_index][0])
            angles[iteration, env_index] = obs[2]
            observations[env_index] = obs if not done else env.reset()
    saver.save(sess, "./models/RL/policy_net_basic.ckpt")

for env in envs:
    env.close()
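As a quick check, here is a minimal sketch of restoring the checkpoint saved above and running the trained policy for one episode. This evaluation loop is an assumption, not part of the original notebook; it reuses the graph nodes X and action defined above:

with tf.Session() as sess:
    # Reload the weights saved by saver.save() above
    saver.restore(sess, "./models/RL/policy_net_basic.ckpt")
    env = gym.make("CartPole-v0")
    obs = env.reset()
    total_reward = 0
    while True:
        # Sample an action from the trained policy and step the environment
        action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})
        obs, reward, done, info = env.step(action_val[0][0])
        total_reward += reward
        if done:
            break
    env.close()
    print("episode reward:", total_reward)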
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype. (warning repeated once per environment created)
INFO:tensorflow:Restoring parameters from ./models/RL/policy_net_pg.ckpt