The red block has to move to the yellow target; the black blocks are obstacles.
The problem is modeled as a Markov chain: we want to predict the best path, and the value of a state is the reward r plus the discounted value of the state it leads to.
SARSA stands for state, action, reward, next state, and next action. It is an on-policy reinforcement learning method: the agent learns only from its own experience, accumulating updates over one or more steps and learning from the actions it actually takes.
From the current state we choose an action and move to the next state. In the next state we choose the next action, and then update the value of the current state-action pair using the reward and the value of that next state-action pair.
Algorithm:
- Initialize Q(s, a)
- Initialize s
- Choose an action a based on s (these two steps repeat within every episode)
- Take a, observe r and s', and update Q from them
- Repeat the above steps for every episode (a minimal loop is sketched below)
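A minimal sketch of this loop, assuming the same env.reset() / env.step() interface that the maze environment below exposes; the dictionary-based Q table and the helper names here are illustrative and not part of the project files:

import random
from collections import defaultdict

def run_sarsa(env, actions, episodes=100, lr=0.01, gamma=0.9, epsilon=0.9):
    Q = defaultdict(float)               # Q[(state, action)] -> value, 0.0 when unseen
    def choose(s):
        # epsilon-greedy: exploit with probability epsilon, otherwise explore
        if random.random() < epsilon:
            return max(actions, key=lambda a: Q[(s, a)])
        return random.choice(actions)

    for episode in range(episodes):
        s = str(env.reset())             # states are stringified so they can key the dict
        a = choose(s)
        while True:
            s_, r, done = env.step(a)    # take the action, observe reward and next state
            s_ = str(s_)
            a_ = choose(s_)              # next action comes from the same policy (on-policy)
            q_target = r if done else r + gamma * Q[(s_, a_)]
            Q[(s, a)] += lr * (q_target - Q[(s, a)])   # SARSA update
            s, a = s_, a_
            if done:
                break
    return Q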
Q Learning
Q is a table indexed by state s and action a; Q[s, a] is the value of taking action a in state s.
Q[s, a] = immediate reward + discounted reward
The immediate reward is what we receive for the action that moves us from one state to the next; the discounted reward accounts for future rewards, weighted by the discount factor.
The Q table determines which action a is best in the current state s.
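Written as update rules (this is exactly what the learn() methods in rl.py below compute, with lr the learning rate and gamma the discount factor):
Q-Learning: Q[s, a] ← Q[s, a] + lr * (r + gamma * max_a' Q[s', a'] - Q[s, a])
SARSA:      Q[s, a] ← Q[s, a] + lr * (r + gamma * Q[s', a'] - Q[s, a])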
π(s) = argmax_a Q[s, a]
π(a | s) = P[A_t = a | S_t = s]
Both agent classes in rl.py implement the update through the same method signature: def learn(self, s, a, r, s_, a_)
The maze environment: maze.py
import numpy as np
import time
import sys
if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk

UNIT = 40    # pixels per grid cell
MAZE_H = 4   # maze height (cells)
MAZE_W = 4   # maze width (cells)
class Maze(tk.Tk, object):
    def __init__(self):
        super(Maze, self).__init__()
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)
        self.title('maze')
        self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
        self._build_maze()

    def _build_maze(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)
        # create grid lines
        for c in range(0, MAZE_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, MAZE_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)
        # create origin (centre of the top-left cell)
        origin = np.array([20, 20])
        # hell (obstacle) 1
        hell1_center = origin + np.array([UNIT * 2, UNIT])
        self.hell1 = self.canvas.create_rectangle(
            hell1_center[0] - 15, hell1_center[1] - 15,
            hell1_center[0] + 15, hell1_center[1] + 15,
            fill='black'
        )
        # hell (obstacle) 2
        hell2_center = origin + np.array([UNIT, UNIT * 2])
        self.hell2 = self.canvas.create_rectangle(
            hell2_center[0] - 15, hell2_center[1] - 15,
            hell2_center[0] + 15, hell2_center[1] + 15,
            fill='black'
        )
        # create oval (the yellow goal)
        oval_center = origin + UNIT * 2
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow'
        )
        # create the agent (the red block)
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red'
        )
        # pack all
        self.canvas.pack()
    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.rect)
        origin = np.array([20, 20])
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red'
        )
        # return the observation (the agent's coordinates)
        return self.canvas.coords(self.rect)
    def step(self, action):
        s = self.canvas.coords(self.rect)
        base_action = np.array([0, 0])
        if action == 0:    # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (MAZE_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (MAZE_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        # move the agent
        self.canvas.move(self.rect, base_action[0], base_action[1])
        s_ = self.canvas.coords(self.rect)
        # reward function
        if s_ == self.canvas.coords(self.oval):
            reward = 1
            done = True
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -1
            done = True
        else:
            reward = 0
            done = False
        return s_, reward, done

    def render(self):
        time.sleep(0.1)
        self.update()  # refresh the display
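Before adding any learning, the environment can be sanity-checked on its own by driving it with random actions. This is a small sketch that assumes the file above is saved as maze.py; it is not part of the original project.

import numpy as np
from maze import Maze

def random_walk():
    # three throwaway episodes with uniformly random actions
    for episode in range(3):
        observation = env.reset()
        while True:
            env.render()
            action = np.random.randint(0, env.n_actions)
            observation, reward, done = env.step(action)
            if done:
                break
    env.destroy()

if __name__ == '__main__':
    env = Maze()
    env.after(100, random_walk)
    env.mainloop()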
The agent classes: rl.py
import numpy as np
import pandas as pd
class RL(object):
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # exploit: choose among the actions with the highest Q value
            state_action = self.q_table.loc[observation, :]
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:
            # explore: choose a random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, *args):
        pass

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # add a row of zeros for a state we have not seen before
            self.q_table.loc[state] = [0.0] * len(self.actions)
# off-policy
class QLearningTable(RL):
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        super(QLearningTable, self).__init__(actions, learning_rate,
                                             reward_decay, e_greedy)

    def learn(self, s, a, r, s_, a_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            # off-policy target: bootstrap from the best action in the next state
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update
# on-policy
class SarsaTable(RL):
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        super(SarsaTable, self).__init__(actions, learning_rate,
                                         reward_decay, e_greedy)

    def learn(self, s, a, r, s_, a_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            # on-policy target: bootstrap from the action the policy actually chose
            q_target = r + self.gamma * self.q_table.loc[s_, a_]
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update
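The only difference between the two classes is the target: QLearningTable bootstraps from the best action in the next state, while SarsaTable bootstraps from the action the epsilon-greedy policy actually picked. A tiny numeric illustration with made-up Q values (gamma = 0.9, r = 0):

import pandas as pd

gamma, r = 0.9, 0
next_row = pd.Series({0: 0.5, 1: 0.1, 2: 0.0, 3: 0.2})   # hypothetical Q[s_, :]
a_ = 1                                                    # suppose the policy picked action 1

q_target_qlearning = r + gamma * next_row.max()   # 0.45: uses the best action (action 0)
q_target_sarsa = r + gamma * next_row[a_]         # 0.09: uses the action actually taken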
The main program: run.py
from maze import Maze
from rl import QLearningTable, SarsaTable
def update():
    for episode in range(100):
        # reset the environment and choose the first action
        observation = env.reset()
        action = RL.choose_action(str(observation))
        while True:
            env.render()
            observation_, reward, done = env.step(action)
            action_ = RL.choose_action(str(observation_))
            # learn from the (s, a, r, s', a') tuple, hence the name SARSA
            RL.learn(str(observation), action, reward, str(observation_),
                     action_)
            observation = observation_
            action = action_
            if done:
                break
    print('game over')
    env.destroy()
if __name__ == '__main__':
    env = Maze()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    RL = SarsaTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
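Because QLearningTable.learn simply ignores the trailing a_ argument, the same update() loop can drive either agent: to train with Q-Learning instead of SARSA, comment out the SarsaTable line and uncomment the QLearningTable line in the __main__ block.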