问题描述:
参考书第八章《Atari Games with Deep Q Network》的示例代码所依赖的库版本变更较多,已无法直接运行,因此本人直接修改了源代码,使程序能够运行。但因本人能力有限,本章代码目前只能保证运行通过。
代码展示:
'''Building an agent to play Atari games'''
from cv2 import merge
from matplotlib import units
import numpy as np
import gym
import tensorflow as tf
from tensorflow.keras.layers import LSTM
from collections import deque,Counter
from tensorflow import keras
# Helper for preprocessing the raw game screen: the frame is downsampled
# and converted to greyscale before it is stored or fed to the network.
# NOTE(review): `coior` looks like a typo for `color` (presumably the mean of
# an RGB triple); it is not referenced anywhere in the code shown here.
coior = np.array([210,164,74]).mean()
def preprocess_observation(obs):
    """Downsample a game frame and convert it to a normalised greyscale image.

    Args:
        obs: Array indexed as (row, column, channel).  The crop below keeps
            every second row in [1, 176) and every second column, so the
            frame must yield exactly 88 x 80 pixels (for example a
            210 x 160 x 3 frame) for the final reshape to succeed.

    Returns:
        A float array of shape (88, 80, 1).  For 8-bit pixel values
        (0..255) the result lies in [-1, 1).
    """
    # Crop the rows to [1, 176) and keep every second pixel on both axes.
    img = obs[1:176:2, ::2]
    # Collapse the colour channels into a single greyscale channel.
    img = img.mean(axis=2)
    # Centre on 128 and scale by 128.  The previous expression
    # `(img - 128) / 128 - 1` subtracted one a second time and therefore
    # produced values in [-2, 0) instead of the intended [-1, 1).
    img = (img - 128) / 128
    return img.reshape(88, 80, 1)
import gym
# Create the Ms. Pac-Man environment; the number of actions it reports is
# reused below for random action selection and for the one-hot action mask.
env = gym.make("MsPacman-v0")
n_outputs = env.action_space.n
# env.render()
# print(n_outputs)
# Next: define a q_network function for building the Q network.  The input
# to the Q network is meant to be the game state X.
# Start from an empty default graph before the networks are defined.
# NOTE(review): eager execution is enabled here but disabled again inside
# q_network and once more before the placeholders are created - confirm
# whether this enable call is needed at all.
tf.compat.v1.reset_default_graph()
tf.compat.v1.enable_eager_execution()
#@tf.function
def q_network(X, name_scope):
    """Placeholder Q-network builder; it returns a constant, not a tensor.

    NOTE(review): this is a stub.  The input is discarded, the layers are
    constructed but never called on any input, and the "output" is the
    constant 32.0.  Because no layer is ever applied, the returned variable
    dictionary is presumably empty - confirm.  Turning this into a real
    network changes the return type and needs matching changes where the
    outputs are consumed.

    Args:
        X: Ignored; the name is immediately rebound to ``tf.float32``.
        name_scope: Name of the variable scope opened while the layer
            objects are created and the variables are collected.

    Returns:
        A ``(variables, output)`` tuple.  ``variables`` maps each trainable
        variable found under the scope to itself, keyed by its name with
        the scope-name prefix removed; ``output`` is the float ``32.0``.
    """
    tf.compat.v1.disable_eager_execution()
    # Intended input shape: (None, 88, 80, 1)
    X = tf.float32  # NOTE(review): discards the caller's input
    weight_init = tf.keras.initializers.variance_scaling()
    with tf.compat.v1.variable_scope(name_scope) as scope:
        # These layer objects are created but never connected to anything.
        recurrent = LSTM(units=10, kernel_initializer=weight_init)
        flatten = tf.keras.layers.Flatten()
        dense = tf.keras.layers.Dense(units=10, kernel_initializer=weight_init)
        # Collect the scope's trainable variables, keyed by their name
        # relative to the scope.
        prefix_len = len(scope.name)
        scoped_variables = tf.compat.v1.get_collection(
            key=tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        trainable = {}
        for variable in scoped_variables:
            trainable[variable.name[prefix_len:]] = variable
        q_values = 32.0
    return trainable, q_values
# Epsilon-greedy exploration schedule.
# NOTE(review): the module-level `epsilon` is not read by epsilon_greedy,
# which derives its own exploration rate from the step count.
epsilon = 0.5
eps_min = 0.005
eps_max = 1.0
eps_decay_steps = 500000


def epsilon_greedy(action, step):
    """Return `action`, or a random action with the current exploration rate.

    The exploration rate falls linearly from `eps_max` at step 0 to
    `eps_min` at `eps_decay_steps` and stays at `eps_min` afterwards.

    Args:
        action: The greedy action, returned unchanged when not exploring.
        step: Step count that drives the linear decay.

    Returns:
        Either `action` itself or a random integer in [0, n_outputs).
    """
    decayed = eps_max - (eps_max - eps_min) * step / eps_decay_steps
    exploration_rate = max(eps_min, decayed)
    # Exactly one random draw decides between exploring and exploiting; the
    # earlier version also drew an unused value on every call.
    if np.random.rand() < exploration_rate:
        return np.random.randint(n_outputs)
    return action
# Experience replay buffer: every transition the agent experiences is
# appended here as [state, action, next_state, reward, done], and training
# minibatches are sampled from it.  The deque keeps at most `buffer_len`
# entries and drops the oldest one when full.
buffer_len = 20000
exp_buffer = deque(maxlen=buffer_len)


def sample_memories(batch_size):
    """Draw up to `batch_size` distinct transitions from the replay buffer.

    Args:
        batch_size: Maximum number of transitions to return.  Fewer are
            returned when the buffer holds fewer entries.

    Returns:
        Five one-dimensional object arrays holding, position by position,
        the first to fifth field of each sampled transition (as appended by
        the caller: state, action, next state, reward, done flag).  All five
        are empty when the buffer is empty.
    """
    picked = np.random.permutation(len(exp_buffer))[:batch_size]
    # Fill an object array cell by cell.  `np.array(exp_buffer)` converted
    # the whole buffer on every call, and NumPy >= 1.24 refuses to build an
    # array from rows that mix image arrays with scalars.
    batch = np.empty((len(picked), 5), dtype=object)
    for row, buffer_index in enumerate(picked):
        transition = exp_buffer[int(buffer_index)]
        for col, item in enumerate(transition):
            batch[row, col] = item
    return batch[:, 0], batch[:, 1], batch[:, 2], batch[:, 3], batch[:, 4]
# Network and training hyper-parameters.  Each value is defined exactly
# once; the block used to repeat eight of these assignments verbatim.
num_episodes = 800            # number of episodes played by the main loop
batch_size = 48               # transitions requested from the replay buffer
input_shape = (None, 88, 80, 1)
learning_rate = 0.001         # passed to the Adam optimizer
X_shape = (None, 88, 80, 1)   # shape of the state placeholder
discount_factor = 0.97
global_step = 0               # step counter shared across all episodes
copy_steps = 100              # interval between runs of the weight-copy op
steps_train = 4               # the training branch runs every this many steps
start_steps = 2000            # steps to collect before the training branch runs
logdir = 'logs'               # directory given to the summary FileWriter
# Build the TF1-style graph: placeholders, the two networks, the weight-copy
# op, the loss and the summary writer.
tf.compat.v1.disable_eager_execution()

# Placeholder for a batch of preprocessed game screens.
print("X_SHAPE", X_shape)
X = tf.compat.v1.placeholder(tf.float32, shape=X_shape)
# Boolean placeholder; presumably meant to toggle training mode.  In the
# code shown it only appears in commented-out feed dicts.
in_training_mode = tf.compat.v1.placeholder(tf.bool)

# Build the two networks.
mainQ, mainQ_outputs = q_network(X, "mainQ")
targetQ, targetQ_outputs = q_network(X, "targetQ")

# Placeholder for the actions that were taken; the one-hot mask keeps only
# the output entry of the chosen action in each row.
X_action = tf.compat.v1.placeholder(tf.int32, shape=(None,))
Q_action = tf.reduce_sum(
    targetQ_outputs * tf.one_hot(X_action, n_outputs),
    axis=-1, keepdims=True)

# One assign op per mainQ variable, overwriting it with the targetQ variable
# stored under the same key.  `tf.assign` is not exported by the TF2
# top-level namespace, so the compat.v1 symbol is used like everywhere else
# in this script; the old spelling only survived because `mainQ` was empty.
copy_op = [
    tf.compat.v1.assign(main_var, targetQ[var_name])
    for var_name, main_var in mainQ.items()
]
copy_target_to_main = tf.group(*copy_op)

# Placeholder for the training targets.
y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1))
# Mean squared difference between the targets and the selected outputs.
loss = tf.reduce_mean(tf.square(y - Q_action))
# Adam optimizer intended for minimizing the loss.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
# NOTE(review): the training op is disabled, so nothing in the code shown
# ever minimises `loss`.
#training_op = optimizer.minimize(loss)
init = tf.compat.v1.global_variables_initializer()
#loss_summary = tf.compat.v1.summary.scalar('LOSS', loss)
merge_summary = tf.compat.v1.summary.merge_all()
file_writer = tf.compat.v1.summary.FileWriter(logdir, tf.compat.v1.get_default_graph())
# Main loop: play `num_episodes` episodes, storing every transition in the
# replay buffer.  The block nesting below is reconstructed, because the
# pasted source had lost its indentation.
# NOTE(review): the network evaluation and the training step are still
# disabled (see the commented-out calls), so the greedy action is derived
# from the raw `mainQ_outputs` value and no weights are updated.
try:
    with tf.compat.v1.Session() as sess:
        init.run()
        # for each episode
        for i in range(num_episodes):
            done = False
            obs = env.reset()
            epoch = 0
            episodic_reward = 0
            actions_counter = Counter()
            episodic_loss = []
            # while the state is not the terminal state
            while not done:
                env.render()
                # get the preprocessed game screen
                obs = preprocess_observation(obs)
                # Intended: feed the screen and read the Q value of every action.
                #actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})
                # `mainQ_outputs` is read as-is.  It used to be rebound to
                # str(mainQ_outputs) on every step, which destroyed the
                # module-level value after the first iteration.
                actions = mainQ_outputs
                # atleast_1d lets argmax accept a scalar output as well.
                action = np.argmax(np.atleast_1d(actions), axis=-1)
                actions_counter[str(action)] += 1
                # select the action using the epsilon greedy policy
                action = epsilon_greedy(action, global_step)
                # perform the action, move to the next state and receive the reward
                next_obs, reward, done, _ = env.step(action)
                # store this transition as an experience in the replay buffer
                exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])
                # every `steps_train` steps, once `start_steps` have passed,
                # sample a minibatch from the replay buffer
                if global_step % steps_train == 0 and global_step > start_steps:
                    o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)
                    # states
                    o_obs = list(o_obs)
                    # next states
                    o_next_obs = list(o_next_obs)
                    # Intended: Q values of the next states.
                    #next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})
                    next_act = mainQ_outputs
                    # Target built from the sampled rewards and done flags.
                    # This used to zero both of them and to truncate
                    # `discount_factor` to int (0.97 -> 0) for the rest of
                    # the run.
                    y_batch = (o_rew.astype(float)
                               + discount_factor
                               * np.max(np.atleast_1d(next_act), axis=-1)
                               * (1 - o_done.astype(float)))
                    # Intended: write the summaries, then run one training step.
                    #mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                    #file_writer.add_summary(mrg_summary, global_step)
                    #train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                    #episodic_loss.append(train_loss)
                # after some interval, run the weight-copy op
                if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                    copy_target_to_main.run()
                obs = next_obs
                epoch += 1
                global_step += 1
                episodic_reward += reward
            print('Epoch', epoch, 'Reward', episodic_reward,)
finally:
    # Release the render window and flush the event file even if the run
    # is interrupted.
    env.close()
    file_writer.close()
实现截图:
参考:
《Hands-on Reinforcement Learning with Python. Master Reinforcement and Deep Reinforcement Learning using OpenAI Gym and TensorFlow》