Deep Q-Learning(DQN)
一、什么是DQN
(一)为什么出现了DQN?
在Q-Learning和Sarsa算法中,我们使用了一种数据结构:Q表,用Q表存储所有的状态以及每个action的Q值。在现实问题中,不仅action种类可能非常多,state的数量更可能是指数级,这就为我们存储Q表和查找对应状态带来了很大的麻烦。在机器学习中,我们可以将state和action输入到神经网络中,神经网络分析后输出action的Q值,然后我们再根据Q-learning的原则执行之后的操作。
通过神经网络,我们省去了存储和查找Q表的时间,同时网络还能够捕捉到一些细节特征,极大地促进了强化学习的发展。
(二)网络是如何更新的?
在Q-Learning中,我们更新参数时,需要两个值:Q现实和Q估计,这两个值都记录在Q表中。
在DQN中,我们要更新神经网络的参数,同样需要这两个值,那如何获得呢?
首先我们通过神经网络预测出Q估计:
$$Q(s,a_1)\quad Q(s,a_2)$$
Q现实也是神经网络预测出的Q值,不过是对s的下一步s’的估计,一般由target network(后面会提到)给出:
$$Q_{现实}=R+\gamma\cdot\max[Q(s',a_1),Q(s',a_2)]$$
更新参数:
$$\alpha(Q_{现实}-Q_{估计})$$
(三)两大法宝为DQN锦上添花
- Experience replay
为DQN建立一个记忆库,将过去或者别人的一些学习经验存在记忆库中,在每次更新神经网络时,可以抽取一些经历进行学习,这样打乱了经历之间的相关性,也使得神经网络更新更有效率。
- Fixed Q-targets
同样是用来打乱经历之间的相关性。我们在DQN中建立两个结构相同、参数不同的神经网络:预测 Q 估计 的神经网络具备最新的参数,而预测 Q 现实 的神经网络使用的参数则是很久以前的参数。
二、DQN的算法过程
Initialize replay memory $D$ to capacity $N$
Initialize action-value function $Q$ with random weights $\theta$
Initialize target action-value function $\hat{Q}$ with weights $\theta^-=\theta$
For episode = 1,M do:
Initialize sequence $s_1=\{x_1\}$ and preprocessed sequence $\phi_1=\phi(s_1)$
For t = 1,T do:
With probability $\varepsilon$ select a random action $a_t$
otherwise select $a_t=\arg\max_a Q(\phi(s_t),a;\theta)$
Execute action $a_t$ in emulator and observe reward $r_t$ and image $x_{t+1}$
Set $s_{t+1}=s_t,a_t,x_{t+1}$ and preprocess $\phi_{t+1}=\phi(s_{t+1})$
Store transition $(\phi_t,a_t,r_t,\phi_{t+1})$ in $D$
Sample random minibatch of transitions $(\phi_j,a_j,r_j,\phi_{j+1})$ from $D$
Set $y_j=\begin{cases} r_j & \text{if episode terminates at step } j+1\\ r_j+\gamma\max_{a'}\hat{Q}(\phi_{j+1},a';\theta^-) & \text{otherwise} \end{cases}$
Perform a gradient descent step on $(y_j-Q(\phi_j,a_j;\theta))^2$ with respect to the network parameters $\theta$
Every $C$ steps reset $\hat{Q}=Q$
End For
End For
三、实现代码
详细信息可查看 莫烦 强化学习
环境代码详见 maze_env.py
【reinforcement learning】Q-Learning简介
main.py
from maze_env import Maze
from RL_brain import DeepQNetwork
def run_maze():
    """Train the module-level agent ``RL`` on the module-level maze ``env``.

    Runs 600 episodes. Every environment step is stored in the agent's
    replay memory; once more than 200 steps have been counted, the agent
    learns on every step whose counter is a multiple of 5. When all episodes
    are done, prints ``game over`` and destroys the environment.
    """
    total_steps = 0
    for _ in range(600):
        # Start each episode from the environment's initial observation.
        state = env.reset()
        finished = False
        while not finished:
            env.render()

            # Pick an action, apply it, and record the resulting transition.
            chosen = RL.choose_action(state)
            next_state, reward, finished = env.step(chosen)
            RL.store_transition(state, chosen, reward, next_state)

            # Let the replay memory fill a little before training, then
            # train on every fifth counted step.
            if total_steps > 200 and total_steps % 5 == 0:
                RL.learn()

            state = next_state
            # The counter is not advanced on the step that ends an episode.
            if not finished:
                total_steps += 1

    print('game over')
    env.destroy()
if __name__ == "__main__":
    # `env` and `RL` are deliberately module-level: run_maze() reads them
    # as globals.
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
    )
    # Register run_maze through the environment's `after` hook (argument
    # 100), then hand control to the environment's main loop.
    env.after(100, run_maze)
    env.mainloop()
    # Executed once mainloop() returns: show the recorded training cost.
    RL.plot_cost()
RL_brain.py
import numpy as np
import pandas as pd
import tensorflow as tf
# Seed NumPy's and TensorFlow's random number generators with a fixed value.
np.random.seed(1)
tf.set_random_seed(1)
# Deep Q Network off-policy
class DeepQNetwork:
    """Off-policy Deep Q-Network agent (TensorFlow 1.x graph mode).

    The agent keeps a fixed-size replay memory and two networks of identical
    structure: ``eval_net`` (trained on every ``learn`` call) and
    ``target_net`` (overwritten with ``eval_net``'s weights every
    ``replace_target_iter`` learn steps).

    Args:
        n_actions: Number of discrete actions (width of the Q-value output).
        n_features: Length of one observation vector.
        learning_rate: Learning rate passed to the RMSProp optimizer.
        reward_decay: Discount factor gamma used in the Q target.
        e_greedy: Upper bound for ``epsilon``. Note that in this class
            ``epsilon`` is the probability of taking the *greedy* action.
        replace_target_iter: Number of learn steps between target-net syncs.
        memory_size: Number of transitions the replay memory can hold.
        batch_size: Number of transitions sampled per learn step.
        e_greedy_increment: If not None, ``epsilon`` starts at 0 and grows by
            this amount per learn step until it reaches ``e_greedy``;
            otherwise ``epsilon`` is fixed at ``e_greedy``.
        output_graph: If True, write the graph to ``./logs/`` for TensorBoard.
    """

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # Number of completed learn() calls; drives the target-net sync.
        self.learn_step_counter = 0

        # Total number of transitions ever stored. Initialised here (rather
        # than lazily) so learn() cannot hit a missing attribute.
        self.memory_counter = 0
        # One row per transition, laid out as [s, a, r, s_].
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        self._build_net()

        # Build the target <- eval assignment ops once; creating them on
        # every sync would keep adding nodes to the graph.
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()
        if output_graph:
            tf.summary.FileWriter("./logs/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        # Loss value of every learn step, in order; used by plot_cost().
        self.cost_his = []

    def _build_layers(self, inputs, c_names, n_l1, w_initializer, b_initializer):
        """Build the two-layer network under the current variable scope.

        Returns the output tensor of shape ``[None, n_actions]``. Variables
        are registered in the collections listed in ``c_names``.
        """
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, n_l1],
                                 initializer=w_initializer, collections=c_names)
            b1 = tf.get_variable('b1', [1, n_l1],
                                 initializer=b_initializer, collections=c_names)
            l1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [n_l1, self.n_actions],
                                 initializer=w_initializer, collections=c_names)
            b2 = tf.get_variable('b2', [1, self.n_actions],
                                 initializer=b_initializer, collections=c_names)
            return tf.matmul(l1, w2) + b2

    def _build_net(self):
        """Create placeholders, the eval and target networks, loss and train op."""
        n_l1 = 10  # hidden layer width
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        # ---- eval network: receives s, is trained against q_target ----
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
        with tf.variable_scope('eval_net'):
            c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_eval = self._build_layers(
                self.s, c_names, n_l1, w_initializer, b_initializer)
        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ---- target network: receives s_, never trained directly ----
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = self._build_layers(
                self.s_, c_names, n_l1, w_initializer, b_initializer)

    def store_transition(self, s, a, r, s_):
        """Write one transition ``(s, a, r, s_)`` into the replay memory.

        Once the memory is full, the oldest rows are overwritten in a
        circular fashion.
        """
        transition = np.hstack((s, [a, r], s_))
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        """Return an action index for a single observation vector.

        With probability ``epsilon`` the action with the highest eval-net
        Q value is returned; otherwise a uniformly random action.
        """
        # Add a batch dimension so the observation fits the placeholder.
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        """Copy the eval network's weights into the target network."""
        self.sess.run(self.replace_target_op)

    def _sample_batch(self):
        """Return ``batch_size`` rows drawn (with replacement) from the filled memory.

        Raises:
            ValueError: If no transition has been stored yet.
        """
        filled = min(self.memory_counter, self.memory_size)
        if filled == 0:
            raise ValueError('cannot sample from an empty replay memory')
        sample_index = np.random.choice(filled, size=self.batch_size)
        return self.memory[sample_index, :]

    def _compute_targets(self, batch_memory, q_next, q_eval):
        """Build the training target for the eval network.

        The target equals ``q_eval`` everywhere except at the action actually
        taken in each row, where it is ``r + gamma * max_a' q_next``. Entries
        for the other actions therefore contribute zero error.

        NOTE: the memory rows carry no terminal flag, so the bootstrap term
        is applied to every transition, including episode-ending ones.
        """
        q_target = q_eval.copy()
        batch_index = np.arange(batch_memory.shape[0], dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
        return q_target

    def learn(self):
        """Run one training step of the eval network on a sampled minibatch."""
        # Periodically sync the target network with the eval network.
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget_params_replaced\n')

        batch_memory = self._sample_batch()
        states = batch_memory[:, :self.n_features]
        next_states = batch_memory[:, -self.n_features:]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: next_states,  # evaluated by the target network
                self.s: states,        # evaluated by the eval network
            })
        q_target = self._compute_targets(batch_memory, q_next, q_eval)

        _, self.cost = self.sess.run(
            [self._train_op, self.loss],
            feed_dict={self.s: states, self.q_target: q_target})
        self.cost_his.append(self.cost)

        # Raise epsilon towards epsilon_max without overshooting it.
        if self.epsilon < self.epsilon_max:
            self.epsilon = min(self.epsilon + self.epsilon_increment, self.epsilon_max)
        else:
            self.epsilon = self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        """Plot the recorded loss of every learn step with matplotlib."""
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()