(All code is adapted from Morvan Python: GitHub - MorvanZhou/Reinforcement-learning-with-tensorflow)
if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Step1: Create the environment with env = Maze() (this step is not explained in detail here)
Step2: With the environment ready, initialize the neural network (RL = DeepQNetwork(......))
step1: Initialize the hyperparameters
step2: Build the network graphs eval_net and target_net. As the code shows, up to and including this step no data such as s, s_, a, or r has been fed in; this step only constructs the graph.
def _build_net(self):
    # ------------------ build evaluate_net ------------------
    self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
    self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
    with tf.variable_scope('eval_net'):
        c_names, n_l1, w_initializer, b_initializer = \
            ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
            tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
            b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
            l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
            b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
            self.q_eval = tf.matmul(l1, w2) + b2
    with tf.variable_scope('loss'):
        self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
    with tf.variable_scope('train'):  # gradient descent with RMSProp
        self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

    # ------------------ build target_net ------------------
    self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
    with tf.variable_scope('target_net'):
        c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
            b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
            l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
            b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
            self.q_next = tf.matmul(l1, w2) + b2
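The learn() step later in this post calls self.sess.run(self.replace_target_op), but that op is not defined in the snippet above. In Morvan's repo it is built in __init__ from the two parameter collections declared here, roughly as follows (a sketch, not reproduced verbatim):

    t_params = tf.get_collection('target_net_params')
    e_params = tf.get_collection('eval_net_params')
    self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

Running this list of assign ops copies every eval_net weight into the matching target_net variable, which is what the periodic "target_params_replaced" step in learn() does.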
Step3: After the networks are initialized, the agent starts interacting with the environment: env.after(100, run_maze)
def run_maze():
    step = 0
    for episode in range(300):
        # initial observation
        observation = env.reset()
        while True:
            # fresh env
            env.render()
            # RL choose action based on observation
            action = RL.choose_action(observation)
            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            # swap observation
            observation = observation_
            # break while loop when end of this episode
            if done:
                break
            step += 1
    # end of game
    print('game over')
    env.destroy()
step1: Run 300 episodes; the number of steps per episode is not fixed (an episode ends based on the latest state: if done: break)
step2: Reset the environment and read the current state: observation = env.reset()
step3: Start the episode and choose an action from the current state: action = RL.choose_action(observation)
def choose_action(self, observation):
    # to have batch dimension when feed into tf placeholder
    # observation arrives as a 1-D array; np.newaxis adds a row axis so it
    # becomes a (1, n_features) batch
    observation = observation[np.newaxis, :]
    if np.random.uniform() < self.epsilon:
        # forward feed the observation and get q value for every actions
        actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
        # sess.run() executes the graph (which defines the placeholder self.s and the
        # evaluation network); self.q_eval is the eval net's output, and feed_dict
        # passes the observation into the placeholder self.s.
        # In other words, for the given observation the eval net estimates the value
        # of every possible action and returns that vector of action values.
        action = np.argmax(actions_value)
    else:
        action = np.random.randint(0, self.n_actions)
    return action
As the code shows, the action is picked by the behavior policy (if np.random.uniform() < self.epsilon: ... else: ...). The Q values it relies on come from the neural network: feed in the state s and the network outputs the action values (actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})). At this point these Q values are only estimates produced by the eval net, not values that have already been trained toward a target.
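As a standalone illustration of this ε-greedy behavior policy (my own sketch with made-up Q values, not output from the real network):

import numpy as np

epsilon, n_actions = 0.9, 4                        # matches e_greedy and the maze's action count above
actions_value = np.array([[0.1, 0.5, -0.2, 0.3]])  # pretend eval-net output for one observation
if np.random.uniform() < epsilon:                  # ~90% of the time: exploit
    action = np.argmax(actions_value)              # greedy choice, action 1 here
else:                                              # ~10% of the time: explore
    action = np.random.randint(0, n_actions)       # uniformly random action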
step4: After the agent executes the action, the environment changes; from it we obtain the new state s_, the reward r, and the done flag (observation_, reward, done = env.step(action))
def step(self, action):
    s = self.canvas.coords(self.rect)
    base_action = np.array([0, 0])
    if action == 0:    # up
        if s[1] > UNIT:
            base_action[1] -= UNIT
    elif action == 1:  # down
        if s[1] < (MAZE_H - 1) * UNIT:
            base_action[1] += UNIT
    elif action == 2:  # right
        if s[0] < (MAZE_W - 1) * UNIT:
            base_action[0] += UNIT
    elif action == 3:  # left
        if s[0] > UNIT:
            base_action[0] -= UNIT
    self.canvas.move(self.rect, base_action[0], base_action[1])  # move agent
    next_coords = self.canvas.coords(self.rect)  # next state
    if next_coords == self.canvas.coords(self.oval):
        reward = 1
        done = True
    elif next_coords in [self.canvas.coords(self.hell1)]:
        reward = -1
        done = True
    else:
        reward = 0
        done = False
    s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2])) / (MAZE_H * UNIT)
    return s_, reward, done
step5: (s, a, r, s_) is stored in the replay memory (RL.store_transition(observation, action, reward, observation_)). Until more than 200 steps have been taken, the agent does not learn; after that, it learns once every 5 steps: if (step > 200) and (step % 5 == 0): RL.learn()
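store_transition itself is not reproduced in this post. In the original repo it roughly packs each transition into one row, np.hstack((s, [a, r], s_)), and writes it into a circular buffer with memory_size rows (a sketch based on that repo):

def store_transition(self, s, a, r, s_):
    if not hasattr(self, 'memory_counter'):
        self.memory_counter = 0
    transition = np.hstack((s, [a, r], s_))         # one row: [observation, action, reward, observation_]
    index = self.memory_counter % self.memory_size  # overwrite the oldest row once the buffer is full
    self.memory[index, :] = transition
    self.memory_counter += 1

This row layout is why learn() below reads the action from column n_features, the reward from column n_features + 1, and observation_ from the last n_features columns.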
(I still don't fully understand this def learn(self) code and am studying it; if you do, please teach me.)
def learn(self):  # how the agent learns and updates its parameters; this is where target_net and eval_net interact
    # check whether to replace the target_net parameters; self.learn_step_counter counts learning steps
    if self.learn_step_counter % self.replace_target_iter == 0:
        self.sess.run(self.replace_target_op)  # every replace_target_iter learning steps, copy the latest eval_net parameters into target_net
        print('\ntarget_params_replaced\n')
    # sample batch memory from all memory
    if self.memory_counter > self.memory_size:  # more transitions have been stored than the memory can hold
        # the buffer is full, so draw batch_size random indices from the whole memory
        sample_index = np.random.choice(self.memory_size, size=self.batch_size)
    else:
        # the buffer is not full yet, so only sample among the memory_counter stored transitions
        sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
    batch_memory = self.memory[sample_index, :]
    # run both networks on the sampled batch
    q_next, q_eval = self.sess.run(
        [self.q_next, self.q_eval],
        feed_dict={
            self.s_: batch_memory[:, -self.n_features:],  # fixed params; q_next comes from the target net fed with the last n_features columns (observation_)
            self.s: batch_memory[:, :self.n_features],    # newest params; q_eval comes from the eval net fed with the first n_features columns (observation)
        })
    q_target = q_eval.copy()
    batch_index = np.arange(self.batch_size, dtype=np.int32)
    eval_act_index = batch_memory[:, self.n_features].astype(int)
    reward = batch_memory[:, self.n_features + 1]
    q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
    _, self.cost = self.sess.run([self._train_op, self.loss],
                                 feed_dict={self.s: batch_memory[:, :self.n_features],
                                            self.q_target: q_target})
    self.cost_his.append(self.cost)
    # increasing epsilon
    self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
    self.learn_step_counter += 1
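To make the q_target step above concrete, here is a small numeric sketch of my own (batch of 2, 4 actions, gamma = 0.9). q_target starts as a copy of q_eval, and only the entry for the action that was actually taken is overwritten with reward + gamma * max(q_next), so the squared-difference loss only produces an error signal for that one action per sample:

import numpy as np

gamma = 0.9
q_eval = np.array([[1.0, 2.0, 3.0, 4.0],       # eval_net(s) for two samples
                   [5.0, 6.0, 7.0, 8.0]])
q_next = np.array([[0.5, 1.5, 0.0, 0.2],       # target_net(s_) for the same samples
                   [2.0, 0.1, 0.3, 0.4]])
eval_act_index = np.array([2, 1])              # actions that were actually taken
reward = np.array([0.0, 1.0])                  # rewards that were received

q_target = q_eval.copy()
batch_index = np.arange(2)
q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
# q_target[0, 2] = 0.0 + 0.9 * 1.5 = 1.35 and q_target[1, 1] = 1.0 + 0.9 * 2.0 = 2.8;
# every other entry still equals q_eval, so (q_target - q_eval) is zero there.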
step6: observation = observation_ (the next state becomes the current state)
step7: Check whether the episode has ended. If not, repeat the steps above; if it has, start a new episode, and continue until 300 episodes are finished.