Import packages
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy.misc
import os
import csv
import itertools
import tensorflow.contrib.slim as slim
%matplotlib inline
from helper import *
Load the game environment
from gridworld import gameEnv
# Create a 9x9 environment with partial=False (per the original notes, the
# agent can observe the complete environment).
env = gameEnv(partial=False,size=9)
# Create a 9x9 environment with partial=True (per the original notes, the
# agent only observes the 3x3 window centred on its current position).
# This rebinds `env`, so the partial environment is the one used below.
env = gameEnv(partial=True,size=9)
Above are examples of a starting environment in our simple game. The agent controls the blue square, and can move up, down, left, or right. The goal is to move to the green squares (for +1 reward) and avoid the red squares (for -1 reward). When the agent moves through a green or red square, it is randomly moved to a new place in the environment.
Implementing the network itself
class Qnetwork():
    """Recurrent dueling Q-network: four conv layers, an RNN, and value/advantage heads.

    Constructing an instance adds all ops to the current default TensorFlow
    graph and exposes the placeholders, intermediate tensors and the training
    op as attributes.
    """

    def __init__(self,h_size,rnn_cell,myScope):
        """Build the network graph.

        Args:
            h_size: Number of filters in the last convolution, i.e. the width
                of the feature vector fed to the RNN. The RNN output is
                reshaped to rows of this width and split into two equal
                halves, so it should be even.
            rnn_cell: Recurrent cell handed to tf.nn.dynamic_rnn; it must
                provide zero_state().
            myScope: String prefix for the conv and RNN scope names.
        """
        # The network receives a game frame flattened into a vector of
        # 21168 (= 84*84*3) values per row; the number of rows is free.
        self.scalarInput = tf.placeholder(shape=[None,21168],dtype=tf.float32)
        # Restore the flat input to 84x84x3 images for the conv stack.
        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,3])
        # conv1: 32 filters, 8x8 kernel, stride 4, no padding, no bias
        # -> 20x20x32.
        self.conv1 = slim.convolution2d(
            inputs=self.imageIn,num_outputs=32,
            kernel_size=[8,8],stride=[4,4],padding='VALID',
            biases_initializer=None,scope=myScope+'_conv1')
        # conv2: 64 filters, 4x4 kernel, stride 2, no padding, no bias
        # -> 9x9x64.
        self.conv2 = slim.convolution2d(
            inputs=self.conv1,num_outputs=64,
            kernel_size=[4,4],stride=[2,2],padding='VALID',
            biases_initializer=None,scope=myScope+'_conv2')
        # conv3: 64 filters, 3x3 kernel, stride 1, no padding, no bias
        # -> 7x7x64.
        self.conv3 = slim.convolution2d(
            inputs=self.conv2,num_outputs=64,
            kernel_size=[3,3],stride=[1,1],padding='VALID',
            biases_initializer=None,scope=myScope+'_conv3')
        # conv4: h_size filters, 7x7 kernel, stride 1, no padding, no bias
        # -> 1x1xh_size.
        self.conv4 = slim.convolution2d(
            inputs=self.conv3,num_outputs=h_size,
            kernel_size=[7,7],stride=[1,1],padding='VALID',
            biases_initializer=None,scope=myScope+'_conv4')

        # Number of consecutive steps per trace in the fed batch.
        self.trainLength = tf.placeholder(dtype=tf.int32)
        # We take the output from the final convolutional layer and send it
        # to a recurrent layer. The input must be reshaped into
        # [batch x trace x units] for rnn processing, and then returned to
        # [batch x units] when sent through the upper levels.
        self.batch_size = tf.placeholder(dtype=tf.int32,shape=[])
        self.convFlat = tf.reshape(
            slim.flatten(self.conv4),[self.batch_size,self.trainLength,h_size])
        # All-zero initial recurrent state; callers may feed their own state
        # through this tensor to carry memory across steps.
        self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)
        # self.rnn holds the per-step outputs, self.rnn_state the final state.
        self.rnn,self.rnn_state = tf.nn.dynamic_rnn(
            inputs=self.convFlat,cell=rnn_cell,dtype=tf.float32,
            initial_state=self.state_in,scope=myScope+'_rnn')
        # Back to one row of h_size features per (trace, step) pair.
        self.rnn = tf.reshape(self.rnn,shape=[-1,h_size])

        # The output from the recurrent layer is then split into separate
        # Advantage and Value streams (two halves along axis 1).
        self.streamA,self.streamV = tf.split(self.rnn,2,1)
        # Linear heads without bias, weights drawn from a standard normal:
        # 4 advantage outputs (one per action) and 1 value output.
        self.AW = tf.Variable(tf.random_normal([h_size//2,4]))
        self.VW = tf.Variable(tf.random_normal([h_size//2,1]))
        self.Advantage = tf.matmul(self.streamA,self.AW)
        self.Value = tf.matmul(self.streamV,self.VW)

        # Gradient of the advantages with respect to the input image.
        self.salience = tf.gradients(self.Advantage,self.imageIn)

        # Then combine them together to get our final Q-values:
        # Q = V + (A - mean of A over the actions of that row).
        self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True))
        # Greedy action: index of the largest Q-value in each row.
        self.predict = tf.argmax(self.Qout,1)

        # Below we obtain the loss from the squared difference between the
        # target and the predicted Q-value of the action actually taken.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,4,dtype=tf.float32)
        # The one-hot product zeroes every Q-value except the chosen action's.
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)
        self.td_error = tf.square(self.targetQ - self.Q)

        # In order to only propagate accurate gradients through the network,
        # we mask the first half of the losses for each trace as per
        # Lample & Chaplot 2016.
        # The second half takes the remaining steps so that the mask always
        # has exactly trainLength entries per trace; using trainLength//2 for
        # both halves made the mask one element short of td_error whenever
        # trainLength was odd.
        self.maskA = tf.zeros([self.batch_size,self.trainLength//2])
        self.maskB = tf.ones([self.batch_size,self.trainLength - self.trainLength//2])
        self.mask = tf.concat([self.maskA,self.maskB],1)
        # Flatten to line up element-wise with td_error.
        self.mask = tf.reshape(self.mask,[-1])
        self.loss = tf.reduce_mean(self.td_error * self.mask)

        # Adam with learning rate 1e-4 minimises the masked TD loss.
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)
Experience Replay
class experience_buffer():
    """Bounded FIFO store of whole episodes from which fixed-length traces are drawn.

    Each stored episode is a sequence of steps; every step must hold exactly
    five values so that sampled traces can be reshaped to rows of width 5.
    """

    def __init__(self, buffer_size = 1000):
        """Create an empty buffer holding at most ``buffer_size`` episodes."""
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self,experience):
        """Append one episode, evicting the oldest episodes if the buffer is full."""
        # Number of old episodes that must go so that the buffer stays within
        # buffer_size once the new episode has been appended.
        overflow = len(self.buffer) + 1 - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]
        self.buffer.append(experience)

    def sample(self,batch_size,trace_length):
        """Draw ``batch_size`` random traces of ``trace_length`` consecutive steps.

        Only episodes with at least ``trace_length`` steps are candidates;
        previously a shorter episode (e.g. one that ended early) could be
        picked and made the start-point draw fail on an empty range.

        Returns:
            Array of shape ``[batch_size*trace_length, 5]``, traces stacked
            one after another.

        Raises:
            ValueError: if fewer than ``batch_size`` stored episodes are long
                enough (raised by ``random.sample``).
        """
        eligible = [episode for episode in self.buffer if len(episode) >= trace_length]
        sampled_episodes = random.sample(eligible,batch_size)
        sampledTraces = []
        for episode in sampled_episodes:
            # Random start such that the whole trace fits inside the episode.
            point = np.random.randint(0,len(episode)+1-trace_length)
            sampledTraces.append(episode[point:point+trace_length])
        sampledTraces = np.array(sampledTraces)
        return np.reshape(sampledTraces,[batch_size*trace_length,5])
Training the network
Setting the training parameters
# --- Replay sampling --------------------------------------------------------
# Number of experience traces per training step, and the length of each trace.
batch_size, trace_length = 4, 8
# A training step is performed once every this many environment steps.
update_freq = 5

# --- Q-learning -------------------------------------------------------------
# Discount factor applied to the target Q-values.
y = 0.99
# Rate handed to updateTargetGraph when the target-update ops are created.
tau = 1e-3
# Size of the final convolutional layer before it is split into the
# Advantage and Value streams.
h_size = 512

# --- Exploration ------------------------------------------------------------
# Chance of a random action at the start and at the end of annealing.
startE, endE = 1, 0.1
# Number of training steps over which startE is reduced to endE.
anneling_steps = 10000
# Number of purely random steps taken before training begins.
pre_train_steps = 10000

# --- Run length -------------------------------------------------------------
# Number of game episodes to train the network with.
num_episodes = 10000
# Maximum number of steps allowed in one episode.
max_epLength = 50

# --- Checkpoints and reporting ----------------------------------------------
# Whether to load a saved model, and the directory models are saved to.
load_model = False
path = "./drqn"
# Length of each step used in gif creation.
time_per_step = 1
# Number of episodes between periodic summaries saved for analysis.
summaryLength = 100
Start training
# Clear the default graph stack and reset the global default graph so that
# re-running this cell does not add ops to a previously built graph.
tf.reset_default_graph()

# We define the cells for the primary and target q-networks.
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)
# Primary (online) network and target network.
mainQN = Qnetwork(h_size,cell,'main')
targetQN = Qnetwork(h_size,cellT,'target')

init = tf.global_variables_initializer()
# Keep at most the 5 most recent checkpoints.
saver = tf.train.Saver(max_to_keep=5)
trainables = tf.trainable_variables()
# Ops that update the target network from the primary one (built by the
# helper module from the trainable variables and tau).
targetOps = updateTargetGraph(trainables,tau)
myBuffer = experience_buffer()

# Set the rate of random action decrease: e starts at startE and drops by
# (startE - endE)/anneling_steps per step once training has begun.
e = startE
stepDrop = (startE - endE)/anneling_steps

# Lists to contain steps and total rewards per episode.
jList = []
rList = []
total_steps = 0

# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

# Write the first line of the master log-file for the Control Center.
with open('./Center/log.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL'])

with tf.Session() as sess:
    if load_model:
        print ('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        # Initialise only when nothing was restored: running the initializer
        # after saver.restore would overwrite the loaded weights with fresh
        # random values.
        sess.run(init)

    # Run the target-update ops once before the first episode.
    updateTarget(targetOps,sess)
    for i in range(num_episodes):
        # Steps of the current episode.
        episodeBuffer = []
        # Reset environment and get first new observation.
        sP = env.reset()
        # processState comes from the helper module; presumably it flattens
        # the frame into the 21168-vector the network expects - confirm there.
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        # Reset the recurrent layer's hidden state.
        state = (np.zeros([1,h_size]),np.zeros([1,h_size]))
        # The Q-Network
        while j < max_epLength:
            j+=1
            # Choose an action greedily from the Q-network, with e chance of
            # a random action; always random during the first pre_train_steps.
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                # Still run the network so the recurrent state advances.
                state1 = sess.run(mainQN.rnn_state,
                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})
                a = np.random.randint(0,4)
            else:
                a, state1 = sess.run([mainQN.predict,mainQN.rnn_state],
                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})
                a = a[0]
            # Take the action: next observation, reward and done flag.
            s1P,r,d = env.step(a)
            s1 = processState(s1P)
            total_steps += 1
            # Store the step as a 1x5 object row (s, a, r, s1, d). The
            # elements have different shapes, so dtype=object is required;
            # newer NumPy versions refuse to build ragged arrays implicitly.
            episodeBuffer.append(np.reshape(np.array([s,a,r,s1,d],dtype=object),[1,5]))
            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop

                if total_steps % (update_freq) == 0:
                    updateTarget(targetOps,sess)
                    # Reset the recurrent layer's hidden state for the batch.
                    state_train = (np.zeros([batch_size,h_size]),np.zeros([batch_size,h_size]))
                    # Get a random batch of experience traces.
                    trainBatch = myBuffer.sample(batch_size,trace_length)
                    # Below we perform the Double-DQN update to the target
                    # Q-values: the primary network picks the action for the
                    # next state (column 3) ...
                    Q1 = sess.run(mainQN.predict,feed_dict={
                        mainQN.scalarInput:np.vstack(trainBatch[:,3]/255.0),
                        mainQN.trainLength:trace_length,mainQN.state_in:state_train,mainQN.batch_size:batch_size})
                    # ... and the target network supplies the Q-values of
                    # that same next state.
                    Q2 = sess.run(targetQN.Qout,feed_dict={
                        targetQN.scalarInput:np.vstack(trainBatch[:,3]/255.0),
                        targetQN.trainLength:trace_length,targetQN.state_in:state_train,targetQN.batch_size:batch_size})
                    # 0 where the step was terminal, 1 otherwise.
                    end_multiplier = -(trainBatch[:,4] - 1)
                    # Target-network value of the action the primary network chose.
                    doubleQ = Q2[range(batch_size*trace_length),Q1]
                    # target = reward + discount * doubleQ (zeroed on terminal steps).
                    targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)
                    # Update the network with our target values.
                    sess.run(mainQN.updateModel,
                        feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]/255.0),mainQN.targetQ:targetQ,
                        mainQN.actions:trainBatch[:,1],mainQN.trainLength:trace_length,
                        mainQN.state_in:state_train,mainQN.batch_size:batch_size})
            rAll += r
            s = s1
            sP = s1P
            state = state1
            if d == True:
                break

        # Add the episode to the experience buffer.
        bufferArray = np.array(episodeBuffer)
        episodeBuffer = list(zip(bufferArray))
        myBuffer.add(episodeBuffer)
        jList.append(j)
        rList.append(rAll)

        # Periodically save the model.
        if i % 1000 == 0 and i != 0:
            saver.save(sess,path+'/model-'+str(i)+'.cptk')
            print ("Saved Model")
        # Every summaryLength episodes, print progress and hand the latest
        # episode to the helper's saveToCenter.
        if len(rList) % summaryLength == 0 and len(rList) != 0:
            print (total_steps,np.mean(rList[-summaryLength:]), e)
            saveToCenter(i,rList,jList,np.reshape(np.array(episodeBuffer),[len(episodeBuffer),5]),
                summaryLength,h_size,sess,mainQN,time_per_step)
    # Final checkpoint after the last episode.
    saver.save(sess,path+'/model-'+str(i)+'.cptk')
代码采用了 DDQN 的方法实现 Q-net,其与 simple DQN 的区别在于使用两个网络 main-QN 和 target-QN,main-QN 用于选择 action ,target-QN 用于评估价值。
simple DQN: $Q_{target} = r + \gamma \max_{a'} Q(s', a')$
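DDQN: $Q_{target} = r + \gamma \, Q_{target\text{-}QN}\left(s', \arg\max_{a'} Q_{main\text{-}QN}(s', a')\right)$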
DQN中均使用了 max 操作,使得选择和评估一个动作值都会过高估计,为解决该问题,DDQN 通过将选择动作和评估动作分割开来避免过高估计的问题。
Testing the network
Setting the testing parameters
# --- Evaluation run ---------------------------------------------------------
# Chance of choosing a random action instead of the network's prediction.
e = 0.01
# Number of game episodes to run the network on.
num_episodes = 10000
# Maximum number of steps allowed in one episode.
max_epLength = 50

# --- Model ------------------------------------------------------------------
# Size of the final convolutional layer before it is split into the
# Advantage and Value streams.
h_size = 512
# Whether to load a saved model, and the directory it is saved to/loaded from.
load_model = True
path = "./drqn"

# --- Reporting --------------------------------------------------------------
# Length of each step used in gif creation.
time_per_step = 1
# Number of episodes between periodic summaries saved for analysis.
summaryLength = 100
Start testing
# Clear the default graph stack and reset the global default graph so that
# the networks below are built into a fresh graph.
tf.reset_default_graph()

# LSTM cells for the primary (online) and target networks.
cell = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)
cellT = tf.contrib.rnn.BasicLSTMCell(num_units=h_size,state_is_tuple=True)
mainQN = Qnetwork(h_size,cell,'main')
# The target network is not queried below, but the saver is created after
# it, so its variables are part of what the saver restores.
targetQN = Qnetwork(h_size,cellT,'target')

init = tf.global_variables_initializer()
# Keep at most the 2 most recent checkpoints.
saver = tf.train.Saver(max_to_keep=2)

# Lists to contain steps and total rewards per episode.
jList = []
rList = []
total_steps = 0

# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

# Write the first line of the master log-file for the Control Center.
with open('./Center/log.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(['Episode','Length','Reward','IMG','LOG','SAL'])

with tf.Session() as sess:
    if load_model:
        print ('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        # Nothing restored: start from freshly initialised variables.
        sess.run(init)

    for i in range(num_episodes):
        # Steps of the current episode.
        episodeBuffer = []
        # Reset environment and get first new observation.
        sP = env.reset()
        # processState comes from the helper module; presumably it flattens
        # the frame into the 21168-vector the network expects - confirm there.
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        # Reset the recurrent layer's hidden state.
        state = (np.zeros([1,h_size]),np.zeros([1,h_size]))
        # The Q-Network
        # The episode ends after max_epLength steps at the latest.
        while j < max_epLength:
            j+=1
            # Choose an action greedily from the Q-network, with e chance of
            # a random action.
            if np.random.rand(1) < e:
                # Still run the network so the recurrent state advances.
                state1 = sess.run(mainQN.rnn_state,
                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,mainQN.state_in:state,mainQN.batch_size:1})
                a = np.random.randint(0,4)
            else:
                a, state1 = sess.run([mainQN.predict,mainQN.rnn_state],
                    feed_dict={mainQN.scalarInput:[s/255.0],mainQN.trainLength:1,
                    mainQN.state_in:state,mainQN.batch_size:1})
                a = a[0]
            # Take the action: next observation, reward and done flag.
            s1P,r,d = env.step(a)
            s1 = processState(s1P)
            total_steps += 1
            # Save the experience to our episode buffer as a 1x5 object row
            # (s, a, r, s1, d). The elements have different shapes, so
            # dtype=object is required; newer NumPy versions refuse to build
            # ragged arrays implicitly.
            episodeBuffer.append(np.reshape(np.array([s,a,r,s1,d],dtype=object),[1,5]))
            rAll += r
            s = s1
            sP = s1P
            state = state1
            if d == True:
                break

        # Record the step count and total reward of this episode.
        bufferArray = np.array(episodeBuffer)
        jList.append(j)
        rList.append(rAll)

        # Every summaryLength episodes, print progress and hand the latest
        # episode to the helper's saveToCenter (no checkpoint is written here).
        if len(rList) % summaryLength == 0 and len(rList) != 0:
            print (total_steps,np.mean(rList[-summaryLength:]), e)
            saveToCenter(i,rList,jList,np.reshape(np.array(episodeBuffer),[len(episodeBuffer),5]),
                summaryLength,h_size,sess,mainQN,time_per_step)
# NOTE(review): the value printed is sum(rList)/num_episodes, i.e. the mean
# total reward per episode, although the message labels it as a percent.
print ("Percent of succesful episodes: " + str(sum(rList)/num_episodes) + "%")