[RL Notes] Implementing a Policy-Based RL Algorithm with TensorFlow

Preface

Reinforcement learning is an important direction in AI. This post implements a policy-based algorithm (a simple policy gradient) with TensorFlow, using the CartPole-v0 environment from OpenAI Gym.

Code

import numpy as np 
import tensorflow as tf 
import gym

env = gym.make('CartPole-v0')
env.reset()
random_episodes = 0
reward_sum = 0
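# Uncomment the block below to watch 10 episodes played with a random policy
# and print the total reward of each one as a baseline.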
'''
while random_episodes < 10:
	env.render()
	observation,reward,done,_ = env.step(np.random.randint(0,2))
	reward_sum += reward
	if done:
		random_episodes += 1
		print("Reward for this episode was:",reward_sum)
		reward_sum = 0
		env.reset()
'''
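# Hyperparameters: H is the hidden-layer size, batch_size is the number of
# episodes per parameter update, D is the dimensionality of the CartPole
# observation, and gamma is the reward discount factor.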
H = 50
batch_size = 25
learning_rate = 1e-1
D = 4
gamma = 0.99

# Policy network: a two-layer MLP that maps the 4-dimensional observation to
# the probability of choosing action 1.
observations = tf.placeholder(tf.float32,[None,D],name="input_x")
w1 = tf.get_variable("w1",shape=[D,H],initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations,w1))
w2 = tf.get_variable("w2",shape=[H,1],initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1,w2)
probability = tf.nn.sigmoid(score)

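# Gradients are computed per episode and accumulated outside the graph;
# w1Grad and w2Grad are the placeholders through which the accumulated
# gradients are fed back into Adam once per batch.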
tvars = tf.trainable_variables()
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
w1Grad = tf.placeholder(tf.float32,name="batch_grad1")
w2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [w1Grad,w2Grad]
updateGrad = adam.apply_gradients(zip(batchGrad,tvars))

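# Convert an episode's per-step rewards into discounted returns by
# accumulating backwards from the last step.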
def discount_reward(r):
	discount_r = np.zeros_like(r)
	running_add = 0
	for t in reversed(range(r.size)):
		running_add = running_add * gamma + r[t]
		discount_r[t] = running_add
	return discount_r

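# input_y holds the "fake label" 1 - action. With that encoding, loglik equals
# log(probability) when action 1 was taken and log(1 - probability) when action 0
# was taken, i.e. the log-likelihood of the chosen action. The loss weights it by
# the discounted, normalized return, and newGrads are the resulting policy gradients.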
input_y = tf.placeholder(tf.float32,[None,1],name="input_y")
advantages = tf.placeholder(tf.float32,name="reward_signal")
loglik = tf.log(input_y * (input_y - probability) + (1 - input_y) * (input_y + probability))
loss = -tf.reduce_mean(loglik * advantages)
newGrads = tf.gradients(loss,tvars)


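# xs: observations, ys: fake labels, drs: rewards of the episode currently being played.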
xs,ys,drs = [],[],[]
reward_sum = 0
episode_number = 1
total_episode = 10000

with tf.Session() as sess:
	rendering = False
	init = tf.global_variables_initializer()
	sess.run(init)
	observation = env.reset()

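	# gradBuffer mirrors the trainable variables and accumulates gradients
	# across the episodes of one batch; it starts (and is reset) at zero.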
	gradBuffer = sess.run(tvars)
	for ix,grad in enumerate(gradBuffer):
		gradBuffer[ix] = grad * 0

	while episode_number <= total_episode:
		if reward_sum/batch_size > 100 or rendering == True:
			env.render()
			rendering = True

		x = np.reshape(observation,[1,D])

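		# Evaluate the policy network and sample an action from the resulting
		# Bernoulli distribution (probability is P(action = 1)).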
		tfprob = sess.run(probability,feed_dict={observations:x})
		action = 1 if np.random.uniform() < tfprob else 0

		xs.append(x)
		y = 1 - action  # "fake label": 1 when action 0 was taken, 0 when action 1 was taken
		ys.append(y)

		observation,reward,done,info = env.step(action)
		reward_sum += reward
		drs.append(reward)

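		# At the end of an episode: stack the recorded data, compute normalized
		# discounted returns, and accumulate this episode's policy gradients.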
		if done:
			episode_number += 1
			epx = np.vstack(xs)
			epy = np.vstack(ys)
			epr = np.vstack(drs)
			xs,ys,drs = [],[],[]

			discounted_epr = discount_reward(epr)
			discounted_epr -= np.mean(discounted_epr)
			discounted_epr /= np.std(discounted_epr)

			tGrad = sess.run(newGrads,feed_dict={observations:epx,input_y:epy,advantages:discounted_epr})
			for ix,grad in enumerate(tGrad):
				gradBuffer[ix] += grad

			# Apply the accumulated gradients once per batch of episodes, then reset the buffer.
			if episode_number % batch_size == 0:
				sess.run(updateGrad,feed_dict={w1Grad:gradBuffer[0],w2Grad:gradBuffer[1]})
				for ix,grad in enumerate(gradBuffer):
					gradBuffer[ix] = grad * 0

				print('Average reward for episode %d : %f.' % (episode_number,reward_sum/batch_size))

				if reward_sum/batch_size > 200:
					print("Task solved in",episode_number,'episodes!')
					break

				reward_sum = 0

			observation = env.reset()


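To make the reward preprocessing concrete, here is a small standalone check (my own snippet, not part of the training script above; it only reuses the discount_reward logic and the same gamma = 0.99):

import numpy as np

gamma = 0.99

def discount_reward(r):
	discount_r = np.zeros_like(r)
	running_add = 0
	for t in reversed(range(r.size)):
		running_add = running_add * gamma + r[t]
		discount_r[t] = running_add
	return discount_r

r = np.array([1.0, 1.0, 1.0])            # CartPole gives +1 for every surviving step
returns = discount_reward(r)              # [2.9701, 1.99, 1.0]
advantages = (returns - returns.mean()) / returns.std()  # zero mean, unit variance
print(returns, advantages)

Because every step is worth +1, earlier steps receive larger returns (they are followed by more reward); subtracting the mean and dividing by the standard deviation turns these returns into the advantage values fed into the gradient computation.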