1.算法思想
Q learning 算法是一种value-based的强化学习算法,Q是quality的缩写,Q函数 Q(state,action)表示在状态state下执行动作action的quality, 也就是能获得的Q value是多少。算法的目标是最大化Q值,通过在状态state下所有可能的动作中选择最好的动作来达到最大化期望reward。
Q learning算法使用Q table来记录不同状态下不同动作的预估Q值。在探索环境之前,Q table会被初始化(可以随机初始化,也可以全部初始化为0),当agent在环境中探索的时候,它会用贝尔曼方程来迭代更新Q(s,a), 随着迭代次数的增多,agent会对环境越来越了解,Q 函数也能被拟合得越来越好,直到收敛或者到达设定的迭代结束次数。
2.算法设计
1)应用设计:
给出一个4x4的方格,给出起点和终点,红色代表物体,黑色代表陷阱,黄色代表终点。要求物体在不掉进陷阱的情况下以尽可能短的路径到达终点。我们的最终目的就是得到一个训练得比较好的Q table,Q table里面不同位置的值就是我们要训练的结果。当模型训练好了以后,agent会学会怎么去玩这个游戏,我们就可以应用此模型了。当开始一局新的游戏时,agent会根据Q table去查找到达目的地的最优路径。
2)Q table:
我们使用Q table来存储agent在不同state下选择不同动作可以获得的Q value。state是指物体所在的位置,action是物体在这个位置上所有能选择的动作。表的每一行表示一个state,每一列表示一个action。表中的值表示在这个state和action的最大期望未来reward。Q table最开始的时候会被初始化,比如初始化为0。如下图所示
3) 选择action:
这里会采用一个exploitation-exploration的方法,它用的是ε-greedy 策略选择action。
exploration:探索未知的领域,比如在某个state下随机选择一个action。exploitation :根据当前的信息,由训练的模型做出最佳的决策,即选择Q value最大的动作
做exploitation和exploration的目的是获得一种长期收益最高的策略,这个过程可能对short-term reward有损失。如果exploitation太多,那么模型比较容易陷入局部最优,但是exploration太多,模型收敛速度太慢。这就是exploitation-exploration权衡。
比如我们设ε=0.9,随机生成一个[0,1]之间的值,如果它小于ε,则进行exploration,随机选择动作;如果它大于ε,则进行exploitation,选择Q value最大的动作。(注意:下文代码中的参数e_greedy含义与此相反——随机数小于e_greedy时选择Q value最大的动作,否则随机选择动作。)
在训练过程中,ε在刚开始的时候会被设得比较大,让agent充分探索,然后逐步减小,agent会开始慢慢选择Q value最大的动作。
由于刚开始ε比较大,agent随机选择一个action。假如在start位置时,agent选择了往右走的动作,到达(0,1)的位置。
4)Q value更新:
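agent从start位置执行一个right动作,走到(0,1)位置,得到了一个实时奖励 + 1分,然后我们更新Q table里第一行第二列的值。更新的方法是用贝尔曼方程(Bellman equation),下面是Q learning算法更新的方法:
$$Q(s,a) \leftarrow Q(s,a) + \alpha\left[r + \gamma \max_{a'} Q(s',a') - Q(s,a)\right]$$
其中 $\alpha$ 是学习率,$\gamma$ 是折扣因子,$r$ 是实时奖励,$s'$ 是执行动作 $a$ 之后到达的状态。
取学习率 $\alpha$ 和折扣因子 $\gamma$(例如下文代码中的默认值 $\alpha=0.01$,$\gamma=0.9$)。代入贝尔曼方程(Bellman equation):
$$Q(\text{start},\text{right}) \leftarrow 0 + \alpha\left[1 + \gamma \max_{a'} Q((0,1),a') - 0\right] = \alpha\,(1 + \gamma \cdot 0) = \alpha$$
(此时Q table的所有值都还是初始值0。)
从而更新Q table如下: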
agent在每一个step的时候都会用上面的方法迭代更新一次Q table,直到Q table不再更新或者到达游戏设置的结束局数。
3.代码设计
4.代码实现
1)算法实现:brain.py(注意:运行脚本中使用 from RL_brain import QLearningTable 导入,保存时文件名需与导入语句保持一致)
import numpy as np
import pandas as pd
class QLearningTable:
    """Tabular Q-learning agent backed by a pandas DataFrame.

    Each row of ``q_table`` is a state label (rows are added the first time a
    state is seen), each column is one entry of ``actions``, and each cell
    holds the current Q-value estimate for that (state, action) pair.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        """Store the hyper-parameters and create an empty Q-table.

        Args:
            actions: list of action labels; used as the Q-table columns.
            learning_rate: step size applied to each update in ``learn``.
            reward_decay: discount factor applied to the best Q-value of the
                next state.
            e_greedy: probability of taking the greedy (highest-Q) action in
                ``choose_action``; a uniformly random action is taken
                otherwise.
        """
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for ``observation`` using the epsilon-greedy rule.

        The state is registered in the Q-table first if it is new.
        """
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # Exploit: pick the best-valued action for this state.
            state_action = self.q_table.loc[observation, :]
            # Several actions may share the maximum value; break ties at
            # random so the agent does not always favour the first column.
            best_actions = state_action[state_action == np.max(state_action)].index
            action = np.random.choice(best_actions)
        else:
            # Explore: pick any action uniformly at random.
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        Args:
            s: label of the state the action was taken in.
            a: the action taken (a Q-table column label).
            r: reward received for the transition.
            s_: label of the resulting state; the string ``'terminal'`` marks
                the end of an episode, in which case no future value is
                bootstrapped.
        """
        # Register both states so the update works even when learn() is
        # called for a state that choose_action() has not seen yet.
        self.check_state_exist(s)
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            # next state is not terminal: bootstrap from its best Q-value
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            # next state is terminal: the target is just the reward
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update

    def check_state_exist(self, state):
        """Add an all-zero row for ``state`` if the Q-table lacks one."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; assigning to a new
            # label through .loc enlarges the table in place instead.
            self.q_table.loc[state] = [0.0] * len(self.actions)
2)表格绘制:
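Draw.py(注意:运行脚本中使用 from maze_env import Maze 导入,保存时文件名需与导入语句保持一致)。其中奖励设计为:
Death: [reward -1]
Not death: [reward +1]
Success: [reward +10]
(注意:下面的示例代码中实际使用的奖励是:到达终点 +1,掉入陷阱 -1,其余情况 0。)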
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 40 # side length of one grid cell, in pixels
MAZE_H = 4 # grid height, in cells
MAZE_W = 4 # grid width, in cells
class Maze(tk.Tk, object):
    """Tk window that draws the grid world and serves as the RL environment.

    The canvas holds two black "hell" squares, one yellow goal oval and the
    red agent square. A state is the agent's canvas coordinates as returned
    by ``canvas.coords``; actions are the integers 0-3.
    """

    def __init__(self):
        super(Maze, self).__init__()
        # step() interprets the indices as 0 = up, 1 = down, 2 = right,
        # 3 = left, so the labels are listed in that same order (the earlier
        # ['u', 'd', 'l', 'r'] had the last two swapped relative to step()).
        self.action_space = ['u', 'd', 'r', 'l']
        self.n_actions = len(self.action_space)
        self.title('maze')
        self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
        self._build_maze()

    def _create_square(self, center, fill):
        """Draw a 30x30 filled square centred on ``center``; return its canvas id."""
        return self.canvas.create_rectangle(
            center[0] - 15, center[1] - 15,
            center[0] + 15, center[1] + 15,
            fill=fill)

    def _build_maze(self):
        """Create the canvas and draw the grid, both hells, the goal and the agent."""
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)

        # create grids
        for c in range(0, MAZE_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, MAZE_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # create origin: the centre of the top-left cell when UNIT is 40
        origin = np.array([20, 20])

        # hell: two cells right and one cell down from the origin
        hell1_center = origin + np.array([UNIT * 2, UNIT])
        self.hell1 = self._create_square(hell1_center, 'black')

        # hell: one cell right and two cells down from the origin
        hell2_center = origin + np.array([UNIT, UNIT * 2])
        self.hell2 = self._create_square(hell2_center, 'black')

        # create oval (the goal): two cells right and two cells down
        oval_center = origin + UNIT * 2
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow')

        # create red rect (the agent) on the origin cell
        self.rect = self._create_square(origin, 'red')

        # pack all
        self.canvas.pack()

    def reset(self):
        """Move the agent back to the origin cell and return its coordinates."""
        self.update()
        time.sleep(0.5)
        # The agent is re-created rather than moved, so it gets a new canvas id.
        self.canvas.delete(self.rect)
        origin = np.array([20, 20])
        self.rect = self._create_square(origin, 'red')
        # return observation
        return self.canvas.coords(self.rect)

    def step(self, action):
        """Apply ``action`` and return ``(s_, reward, done)``.

        Args:
            action: 0 = up, 1 = down, 2 = right, 3 = left. A move that would
                leave the grid is ignored and the agent stays in place.

        Returns:
            ``s_`` is the agent's new canvas coordinates, or the string
            ``'terminal'`` once the goal or a hell is reached. ``reward`` is
            1 on the goal, -1 on a hell and 0 otherwise. ``done`` is True
            exactly when ``s_`` is ``'terminal'``.
        """
        s = self.canvas.coords(self.rect)
        base_action = np.array([0, 0])
        if action == 0:  # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (MAZE_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (MAZE_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT

        self.canvas.move(self.rect, base_action[0], base_action[1])  # move agent

        s_ = self.canvas.coords(self.rect)  # next state

        # reward function
        if s_ == self.canvas.coords(self.oval):
            reward = 1
            done = True
            s_ = 'terminal'
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -1
            done = True
            s_ = 'terminal'
        else:
            reward = 0
            done = False

        return s_, reward, done

    def render(self):
        """Pause briefly and redraw the window so each move is visible."""
        time.sleep(0.1)
        self.update()
def update():
    """Smoke-test the environment by running 10 episodes of random moves.

    Uses the module-level ``env``. Each episode resets the agent, then renders
    and steps until the environment reports ``done``.
    """
    for _ in range(10):
        env.reset()
        while True:
            env.render()
            # A constant action (previously always 1, i.e. down) pushes the
            # agent against the bottom wall of the first column, where no
            # terminal cell exists, so the episode never finished. Sampling a
            # random action lets every episode reach a terminal cell.
            a = np.random.randint(env.n_actions)
            _, _, done = env.step(a)
            if done:
                break
if __name__ == '__main__':
    # Create the maze window, schedule the demo loop to start 100 ms after
    # the window comes up, then hand control to the Tk event loop.
    env = Maze()
    env.after(100, update)
    env.mainloop()
3)运行:
from maze_env import Maze
from RL_brain import QLearningTable
def update():
    """Train the agent for 100 episodes, then close the maze window.

    Uses the module-level ``env`` (the maze) and ``RL`` (the Q-learning agent).
    """
    for episode in range(100):
        # Start every episode from the initial observation.
        state = env.reset()
        finished = False
        while not finished:
            # Redraw the window so the move is visible.
            env.render()
            # Observations are turned into strings so they can be used as
            # Q-table row labels.
            state_key = str(state)
            chosen = RL.choose_action(state_key)
            # Take the action and observe the outcome.
            next_state, reward, finished = env.step(chosen)
            # Learn from this transition.
            RL.learn(state_key, chosen, reward, str(next_state))
            # The next observation becomes the current one.
            state = next_state
    # end of game
    print('game over')
    env.destroy()
if __name__ == "__main__":
    # Build the environment, then an agent with one action index for each
    # action the environment offers.
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    # Start training 100 ms after the window appears, then enter the Tk
    # event loop.
    env.after(100, update)
    env.mainloop()