Reinforcement Learning
Q-Learning
Basic workflow
'''
# T is the treasure's position, o is the explorer's position
Pseudocode for the main loop:
Initialize Q(s, a) arbitrarily        # Q(s, a): the Q-table entry for action a in state s
Repeat (for each episode):
    Initialize s
    Repeat (for each step of episode):
        Choose a from s using policy derived from Q (e.g., epsilon-greedy)
        Take action a, observe r, s'
        Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        s <- s'
    until s is terminal

Q-learning keeps a table of action values (Q values): every action in a given state has a value
Q(s, a), i.e. the value of taking action a in state s is Q(s, a).
In the explorer game above, s is the position of o, and at every position the explorer can take
two actions, left/right -- these are all the available a's.
If at some position s1 the explorer evaluates its two actions a1/a2 = left/right and finds
Q(s1, a1) > Q(s1, a2), it picks left. That is the simple action-selection rule of Q-learning.
'''
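Written out as a single formula, the update step in the pseudocode above is the standard tabular Q-learning rule, with alpha = ALPHA and gamma = GAMMA from the script below:

    Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]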
import numpy as np
import pandas as pd
import time

# Preset parameters
np.random.seed(2)             # reproducible pseudo-random sequence
N_STATES = 6                  # width of the 1-D world (distance from the start to the treasure)
ACTIONS = ['left', 'right']   # actions available to the explorer
EPSILON = 0.9                 # greediness
ALPHA = 0.1                   # learning rate
GAMMA = 0.9                   # discount factor (decay of future rewards)
MAX_EPISODES = 13             # maximum number of episodes
FRESH_TIME = 0.01             # pause between moves (for the display)
# Build the Q-table
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),   # initialise all entries to 0; the table is N_STATES x len(ACTIONS)
        columns=actions,                      # column names are the action names
    )
    return table
"""
Q-table:
left right
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
4 0.0 0.0
5 0.0 0.0
"""
'''
How the explorer picks an action: epsilon-greedy. In the early phase, exploring the environment at
random usually beats a fixed behaviour pattern; this is the stage where experience is accumulated,
so we do not want the explorer to be too greedy. EPSILON is the value that controls that greediness.
It could be raised as exploration goes on (becoming greedier and greedier); in this example it is
fixed at EPSILON = 0.9: 90% of the time the current best action is chosen, 10% of the time we explore.
'''
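The script below keeps EPSILON fixed at 0.9, but as noted above the greediness could also be raised over time. A minimal sketch of such a schedule is shown here purely for illustration (the names EPSILON_MAX, EPSILON_INCREMENT and update_epsilon are hypothetical, not part of the script); the DQN class later in these notes does the same thing through its e_greedy_increment argument.

EPSILON = 0.0             # hypothetical starting point: fully exploratory
EPSILON_MAX = 0.9         # final greediness, matching the fixed value used below
EPSILON_INCREMENT = 0.001

def update_epsilon():
    # call once per learning step: drift from exploration towards greedy behaviour
    global EPSILON
    EPSILON = min(EPSILON_MAX, EPSILON + EPSILON_INCREMENT)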
# Choose an action: pick an action for the current state based on the Q-table
def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]   # all action values for this state
    if np.random.uniform() > EPSILON or (state_actions == 0).all():   # non-greedy, or this state is still unexplored
        action_name = np.random.choice(ACTIONS)    # explore: pick a random action
    else:
        action_name = state_actions.idxmax()       # greedy: pick the action with the largest Q value
    return action_name
'''
The environment: how the explorer actually explores.
After an action is taken, the environment responds with the next state (S_) and the reward (R)
obtained for taking action (A) in the previous state (S).
The rule defined here is that the explorer only ever gets a single reward, R = 1, when o reaches T;
in every other case there is no reward.
'''
def get_env_feedback(S, A):
    # the environment's response to action A taken in state S
    if A == 'right':
        if S == N_STATES - 2:      # one more step to the right reaches the treasure
            S_ = 'terminal'        # episode ends
            R = 1                  # the only reward in this world
        else:
            S_ = S + 1             # move one step to the right
            R = 0
    else:                          # move left
        R = 0                      # no reward
        if S == 0:
            S_ = S                 # already at the wall, stay put
        else:
            S_ = S - 1
    return S_, R
# Update the on-screen environment
def update_env(S, episode, step_counter):
    # this is how the environment is redrawn
    env_list = ['-'] * (N_STATES - 1) + ['T']   # '-----T' : our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
# Main loop
'''
Initialize Q(s, a) arbitrarily        # Q(s, a): the Q-table entry for action a in state s
Repeat (for each episode):
    Initialize s
    Repeat (for each step of episode):
        Choose a from s using policy derived from Q (e.g., epsilon-greedy)
        Take action a, observe r, s'
        Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        s <- s'
    until s is terminal
'''
def RL():
    q_table = build_q_table(N_STATES, ACTIONS)       # initial Q-table
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0                                        # starting position of the episode
        is_terminated = False                        # whether the episode has ended
        update_env(S, episode, step_counter)         # refresh the display
        while not is_terminated:
            A = choose_action(S, q_table)            # choose an action
            S_, R = get_env_feedback(S, A)           # take the action, get the environment's feedback
            q_predict = q_table.loc[S, A]            # estimated (state, action) value
            if S_ != 'terminal':                     # episode not finished
                q_target = R + GAMMA * q_table.loc[S_, :].max()   # target ("true") value
            else:
                q_target = R                         # target at the end of the episode
                is_terminated = True
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)   # update the Q-table
            S = S_                                   # move the explorer to the next state
            update_env(S, episode, step_counter + 1) # refresh the display
            step_counter += 1
    return q_table

if __name__ == '__main__':
    q_table = RL()
    print('\r\nQ-table:\n')
    print(q_table)
The maze environment (maze_env.py)
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example. The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 100 # pixels
MAZE_H = 5 # grid height
MAZE_W = 5 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='green',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
hell2_center = origin + np.array([UNIT, UNIT * 2])
self.hell2 = self.canvas.create_rectangle(
hell2_center[0] - 15, hell2_center[1] - 15,
hell2_center[0] + 15, hell2_center[1] + 15,
fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.5)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return self.canvas.coords(self.rect)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
s_ = self.canvas.coords(self.rect) # next state
# reward function
if s_ == self.canvas.coords(self.oval):
reward = 1
done = True
s_ = 'terminal'
elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
reward = -1
done = True
s_ = 'terminal'
else:
reward = 0
done = False
return s_, reward, done
def render(self):
time.sleep(0.1)
self.update()
def update():
for t in range(10):
s = env.reset()
while True:
env.render()
a = 1
s, r, done = env.step(a)
if done:
break
if __name__ == '__main__':
env = Maze()
env.after(100, update)
env.mainloop()
Building the Q-Learning class (RL_brain.py)
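The Q-Learning brain lives in RL_brain.py. The full listing (shared with the Sarsa variants) appears later in the Sarsa section; the sketch below repeats just the QLearningTable part so this section is self-contained. It assumes pandas < 2.0, because DataFrame.append (used in the full listing as well) was removed in pandas 2.0.

import numpy as np
import pandas as pd

class QLearningTable:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions        # a list of action indices
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # states are discovered on the fly; an unseen state gets a new row of zeros
        if state not in self.q_table.index:
            self.q_table = self.q_table.append(
                pd.Series([0] * len(self.actions), index=self.q_table.columns, name=state))

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.rand() < self.epsilon:
            state_action = self.q_table.loc[observation, :]
            # break ties between equally good actions at random
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()   # Q-learning target
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)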
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example. The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 100 # pixels
MAZE_H = 5 # grid height
MAZE_W = 5 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='green',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
hell2_center = origin + np.array([UNIT, UNIT * 2])
self.hell2 = self.canvas.create_rectangle(
hell2_center[0] - 15, hell2_center[1] - 15,
hell2_center[0] + 15, hell2_center[1] + 15,
fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.5)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return self.canvas.coords(self.rect)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
s_ = self.canvas.coords(self.rect) # next state
# reward function
if s_ == self.canvas.coords(self.oval):
reward = 1
done = True
s_ = 'terminal'
elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
reward = -1
done = True
s_ = 'terminal'
else:
reward = 0
done = False
return s_, reward, done
def render(self):
time.sleep(0.1)
self.update()
def update():
for t in range(10):
s = env.reset()
while True:
env.render()
a = 1
s, r, done = env.step(a)
if done:
break
if __name__ == '__main__':
env = Maze()
env.after(100, update)
env.mainloop()
Running Q-Learning in the maze
'''
Teach the explorer to solve the maze: the yellow cell is paradise (reward 1), the black cells are hell (reward -1). Most RL is reward-driven, so defining the reward well is one of the more important parts of RL.
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the main part which controls the update method of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
'''
from maze_env import Maze
from RL_brain import QLearningTable

def update():
    for episode in range(100):
        # initial observation
        observation = env.reset()
        while True:
            # refresh the environment
            env.render()
            # the RL brain picks an action based on the current observation
            action = RL.choose_action(str(observation))
            # the explorer takes the action in the environment, which returns the next
            # observation, the reward and done (did we fall into hell or reach paradise?)
            observation_next, reward, done = env.step(action)
            # the RL brain learns from this transition (state, action, reward, state_)
            RL.learn(str(observation), action, reward, str(observation_next))
            # pass the next state on to the next iteration
            observation = observation_next
            # the episode is over once we fall into hell or reach paradise
            if done:
                break
    # end of game: close the window
    print('game over')
    env.destroy()

if __name__ == '__main__':
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
Sarsa
Basic workflow
'''
# T is the treasure's position, o is the explorer's position
Pseudocode for the main loop:
Initialize Q(s, a) arbitrarily        # Q(s, a): the Q-table entry for action a in state s
Repeat (for each episode):
    Initialize s
    Choose a from s using policy derived from Q (e.g., epsilon-greedy)
    Repeat (for each step of episode):
        Take action a, observe r, s'
        Choose a' from s' using policy derived from Q (e.g., epsilon-greedy)
        Q(s,a) <- Q(s,a) + alpha * [r + gamma * Q(s',a') - Q(s,a)]
        s <- s', a <- a'
    until s is terminal

Compared with Q-learning, Sarsa is more timid. Q-learning always bootstraps from max Q, and that
max makes it greedy: it ignores every outcome that is not the maximum. You can think of Q-learning
as a greedy, bold, brave algorithm that does not care much about mistakes or death, while Sarsa is
a conservative algorithm that cares about every single decision and is sensitive to mistakes and
death. The visualisation later will show this difference. Both styles have their uses: in practice,
if you care about wear and damage to a machine, a conservative algorithm causes less damage during training.
'''
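The only difference between the two pseudocode listings is the bootstrap target: Q-learning backs up from the best action in s', while Sarsa backs up from the action a' it will actually take next:

    Q-learning:  Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]
    Sarsa:       Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \, Q(s',a') - Q(s,a) \right]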
import numpy as np
import pandas as pd
import time

# Preset parameters
np.random.seed(2)             # reproducible pseudo-random sequence
N_STATES = 6                  # width of the 1-D world (distance from the start to the treasure)
ACTIONS = ['left', 'right']   # actions available to the explorer
EPSILON = 0.9                 # greediness
ALPHA = 0.1                   # learning rate
GAMMA = 0.9                   # discount factor (decay of future rewards)
MAX_EPISODES = 13             # maximum number of episodes
FRESH_TIME = 0.01             # pause between moves (for the display)
# Build the Q-table
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),   # initialise all entries to 0; the table is N_STATES x len(ACTIONS)
        columns=actions,                      # column names are the action names
    )
    return table
"""
Q-table:
left right
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
4 0.0 0.0
5 0.0 0.0
"""
'''
How the explorer picks an action: epsilon-greedy. In the early phase, exploring the environment at
random usually beats a fixed behaviour pattern; this is the stage where experience is accumulated,
so we do not want the explorer to be too greedy. EPSILON controls that greediness and could be
raised as exploration goes on; in this example it is fixed at EPSILON = 0.9: 90% of the time the
current best action is chosen, 10% of the time we explore.
'''
# Choose an action: pick an action for the current state based on the Q-table
def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]   # all action values for this state
    if np.random.uniform() > EPSILON or (state_actions == 0).all():   # non-greedy, or this state is still unexplored
        action_name = np.random.choice(ACTIONS)    # explore: pick a random action
    else:
        action_name = state_actions.idxmax()       # greedy: pick the action with the largest Q value
    return action_name
'''
The environment: how the explorer actually explores.
After an action is taken, the environment responds with the next state (S_) and the reward (R)
obtained for taking action (A) in the previous state (S).
The rule defined here is that the explorer only ever gets a single reward, R = 1, when o reaches T;
in every other case there is no reward.
'''
def get_env_feedback(S, A):
    # the environment's response to action A taken in state S
    if A == 'right':
        if S == N_STATES - 2:      # one more step to the right reaches the treasure
            S_ = 'terminal'        # episode ends
            R = 1                  # the only reward in this world
        else:
            S_ = S + 1             # move one step to the right
            R = 0
    else:                          # move left
        R = 0                      # no reward
        if S == 0:
            S_ = S                 # already at the wall, stay put
        else:
            S_ = S - 1
    return S_, R
# Update the on-screen environment
def update_env(S, episode, step_counter):
    # this is how the environment is redrawn
    env_list = ['-'] * (N_STATES - 1) + ['T']   # '-----T' : our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
# Main loop
'''
Initialize Q(s, a) arbitrarily        # Q(s, a): the Q-table entry for action a in state s
Repeat (for each episode):
    Initialize s
    Choose a from s using policy derived from Q (e.g., epsilon-greedy)
    Repeat (for each step of episode):
        Take action a, observe r, s'
        Choose a' from s' using policy derived from Q (e.g., epsilon-greedy)
        Q(s,a) <- Q(s,a) + alpha * [r + gamma * Q(s',a') - Q(s,a)]
        s <- s', a <- a'
    until s is terminal
'''
def RL():
    q_table = build_q_table(N_STATES, ACTIONS)       # initial Q-table
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0                                        # starting position of the episode
        A = choose_action(S, q_table)                # Sarsa picks the first action before the loop starts
        is_terminated = False                        # whether the episode has ended
        update_env(S, episode, step_counter)         # refresh the display
        while not is_terminated:
            S_, R = get_env_feedback(S, A)           # take the action, get the environment's feedback
            q_predict = q_table.loc[S, A]            # estimated (state, action) value
            if S_ != 'terminal':                     # episode not finished
                A_ = choose_action(S_, q_table)      # Sarsa also picks the next action right away
                q_target = R + GAMMA * q_table.loc[S_, A_]   # Sarsa target: uses Q(s', a'), not the max
            else:
                A_ = None
                q_target = R                         # target at the end of the episode
                is_terminated = True
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)   # update the Q-table
            S, A = S_, A_                            # move on to the next state and action
            update_env(S, episode, step_counter + 1) # refresh the display
            step_counter += 1
    return q_table

if __name__ == '__main__':
    q_table = RL()
    print('\r\nQ-table:\n')
    print(q_table)
Building the Sarsa classes (RL_brain.py)
"""
This part of code is the Q learning brain, which is a brain of the agent.
All decisions are made in here.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
'''
Sarsa:
Initialize Q(s, a) arbitrarily        # Q(s, a): the Q-table entry for action a in state s
Repeat (for each episode):
    Initialize s
    Choose a from s using policy derived from Q (e.g., epsilon-greedy)
    Repeat (for each step of episode):
        Take action a, observe r, s'
        Choose a' from s' using policy derived from Q (e.g., epsilon-greedy)
        Q(s,a) <- Q(s,a) + alpha * [r + gamma * Q(s',a') - Q(s,a)]
        s <- s', a <- a'
    until s is terminal
'''
import numpy as np
import pandas as pd
class RL(object):
def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
self.actions = action_space # a list
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon = e_greedy
self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
def check_state_exist(self, state):
if state not in self.q_table.index:
# append new state to q table
self.q_table = self.q_table.append(
pd.Series(
[0]*len(self.actions),
index=self.q_table.columns,
name=state,
)
)
    # variant used by SarsaLambdaTable: a new state must be added to the
    # eligibility-trace table as well, otherwise the two tables fall out of alignment
    def check_state_exist2(self, state):
        if state not in self.q_table.index:
            to_be_append = pd.Series(
                [0] * len(self.actions),
                index=self.q_table.columns,
                name=state,
            )
            self.q_table = self.q_table.append(to_be_append)
            if hasattr(self, 'eligibility_trace'):
                self.eligibility_trace = self.eligibility_trace.append(to_be_append)
def choose_action(self, observation):
self.check_state_exist(observation)
# action selection
if np.random.rand() < self.epsilon:
# choose best action
state_action = self.q_table.loc[observation, :]
            # some actions may have the same value; randomly choose one of them
action = np.random.choice(state_action[state_action == np.max(state_action)].index)
else:
# choose random action
action = np.random.choice(self.actions)
return action
def learn(self, *args):
pass
# off-policy
class QLearningTable(RL):
def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
super(QLearningTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
def learn(self, s, a, r, s_):
self.check_state_exist(s_)
q_predict = self.q_table.loc[s, a]
if s_ != 'terminal':
q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal
else:
q_target = r # next state is terminal
self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
# on-policy
class SarsaTable(RL):
def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
def learn(self, s, a, r, s_, a_):
self.check_state_exist(s_)
q_predict = self.q_table.loc[s, a]
if s_ != 'terminal':
q_target = r + self.gamma * self.q_table.loc[s_, a_] # next state is not terminal
else:
q_target = r # next state is terminal
self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update
class SarsaLambdaTable(RL):
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9):
        super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy)
        # backward-view algorithm: eligibility trace
        self.lambda_ = trace_decay
        self.eligibility_trace = self.q_table.copy()   # empty eligibility-trace table
    def learn(self, s, a, r, s_, a_):
        # this part is the same as Sarsa
        self.check_state_exist2(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, a_]   # next state is not terminal
        else:
            q_target = r                                           # next state is terminal
        error = q_target - q_predict
        # from here on it differs from Sarsa:
        # Method 1: add 1 to the visited state-action pair to mark it as a necessary
        # step on the way to the reward
        # self.eligibility_trace.loc[s, a] += 1
        # Method 2 (more effective): reset this state's traces and set the chosen action to 1
        self.eligibility_trace.loc[s, :] *= 0
        self.eligibility_trace.loc[s, a] = 1
        # update the whole Q-table in proportion to the eligibility traces
        self.q_table += self.lr * error * self.eligibility_trace
        # decay the traces over time: steps further away from the reward matter less
        self.eligibility_trace *= self.gamma * self.lambda_
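In formulas, one SarsaLambdaTable.learn() call above does the following (delta is the TD error, e is the eligibility-trace table, and the trace reset corresponds to the "Method 2" branch in the code):

    \delta = r + \gamma \, Q(s',a') - Q(s,a)
    e(s, \cdot) \leftarrow 0, \qquad e(s,a) \leftarrow 1
    Q \leftarrow Q + \alpha \, \delta \, e
    e \leftarrow \gamma \lambda \, e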
Running Sarsa in the maze
from maze_env import Maze
from RL_brain import SarsaTable
def update():
for episode in range(100):
observation = env.reset()
action = RL.choose_action(str(observation))
while True:
env.render()
observation_, reward, done = env.step(action)
# RL choose action based on next observation
action_ = RL.choose_action(str(observation_))
            # RL learns from this transition (s, a, r, s_, a_) ==> Sarsa
RL.learn(str(observation), action, reward, str(observation_), action_)
# swap observation and action
observation = observation_
action = action_
# break while loop when end of this episode
if done:
break
# end of game
print('game over')
env.destroy()
if __name__ == '__main__':
env = Maze()
RL = SarsaTable(actions=list(range(env.n_actions)))
env.after(100, update)
env.mainloop()
Deep Q-Network (DQN)
Environment setup
"""
Reinforcement learning maze example.
Red rectangle: explorer.
Black rectangles: hells [reward = -1].
Yellow bin circle: paradise [reward = +1].
All other states: ground [reward = 0].
This script is the environment part of this example.
The RL is in RL_brain.py.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import time
import sys
if sys.version_info.major == 2:
import Tkinter as tk
else:
import tkinter as tk
UNIT = 40 # pixels
MAZE_H = 4 # grid height
MAZE_W = 4 # grid width
class Maze(tk.Tk, object):
def __init__(self):
super(Maze, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
self.n_features = 2
self.title('maze')
self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT))
self._build_maze()
def _build_maze(self):
self.canvas = tk.Canvas(self, bg='white',
height=MAZE_H * UNIT,
width=MAZE_W * UNIT)
# create grids
for c in range(0, MAZE_W * UNIT, UNIT):
x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
self.canvas.create_line(x0, y0, x1, y1)
for r in range(0, MAZE_H * UNIT, UNIT):
x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
self.canvas.create_line(x0, y0, x1, y1)
# create origin
origin = np.array([20, 20])
# hell
hell1_center = origin + np.array([UNIT * 2, UNIT])
self.hell1 = self.canvas.create_rectangle(
hell1_center[0] - 15, hell1_center[1] - 15,
hell1_center[0] + 15, hell1_center[1] + 15,
fill='black')
# hell
# hell2_center = origin + np.array([UNIT, UNIT * 2])
# self.hell2 = self.canvas.create_rectangle(
# hell2_center[0] - 15, hell2_center[1] - 15,
# hell2_center[0] + 15, hell2_center[1] + 15,
# fill='black')
# create oval
oval_center = origin + UNIT * 2
self.oval = self.canvas.create_oval(
oval_center[0] - 15, oval_center[1] - 15,
oval_center[0] + 15, oval_center[1] + 15,
fill='yellow')
# create red rect
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# pack all
self.canvas.pack()
def reset(self):
self.update()
time.sleep(0.1)
self.canvas.delete(self.rect)
origin = np.array([20, 20])
self.rect = self.canvas.create_rectangle(
origin[0] - 15, origin[1] - 15,
origin[0] + 15, origin[1] + 15,
fill='red')
# return observation
return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
def step(self, action):
s = self.canvas.coords(self.rect)
base_action = np.array([0, 0])
if action == 0: # up
if s[1] > UNIT:
base_action[1] -= UNIT
elif action == 1: # down
if s[1] < (MAZE_H - 1) * UNIT:
base_action[1] += UNIT
elif action == 2: # right
if s[0] < (MAZE_W - 1) * UNIT:
base_action[0] += UNIT
elif action == 3: # left
if s[0] > UNIT:
base_action[0] -= UNIT
self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent
next_coords = self.canvas.coords(self.rect) # next state
# reward function
if next_coords == self.canvas.coords(self.oval):
reward = 1
done = True
elif next_coords in [self.canvas.coords(self.hell1)]:
reward = -1
done = True
else:
reward = 0
done = False
s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT)
return s_, reward, done
def render(self):
# time.sleep(0.01)
self.update()
Building the DQN class
'''
Building the networks.
To implement DQN with TensorFlow, the recommended setup is two neural networks:
target_net predicts the q_target values and its parameters are not updated immediately;
eval_net predicts q_eval and always holds the newest parameters.
The two networks have exactly the same structure.
'''
import numpy as np
import pandas as pd
# TF1-style graph code, run through TensorFlow 2's compat.v1 layer
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

np.random.seed(1)
tf.set_random_seed(1)
class DeepQNetwork:
    '''
    n_actions: number of actions the network outputs Q values for
    n_features: number of observation features (e.g. coordinates of the agent)
    learning_rate = 0.01: learning rate
    reward_decay = 0.9: gamma
    e_greedy = 0.9: upper bound on the greediness
    replace_target_iter = 300: how many learning steps between updates of the target-net parameters
    memory_size = 500: capacity of the replay memory (how many transitions it can hold)
    batch_size = 32: minibatch size for the gradient-descent updates
    e_greedy_increment = None: if set, epsilon grows by this amount per learning step (gradually less exploration)
    output_graph = False: whether to write the graph for TensorBoard
    '''
def __init__(
self,
n_actions,
n_features,
learning_rate = 0.01,
reward_decay = 0.9,
e_greedy = 0.9,
replace_target_iter = 300,
memory_size = 500,
batch_size = 32,
e_greedy_increment = None,
output_graph = False,
):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        # total number of learning steps taken; epsilon grows with it, and it also decides
        # when to copy the latest parameters into target_net
        self.learn_step_counter = 0
        # initialise the replay memory with zeros; one row per transition [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
        # build the two networks
        self._build_net()
        # ops that copy eval_net's parameters into target_net
        t_params = tf.get_collection('target_net_params')   # parameters of target_net
        e_params = tf.get_collection('eval_net_params')     # parameters of eval_net
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        self.sess = tf.compat.v1.Session()
        # optionally write a TensorBoard graph file
        if output_graph:
            # $ tensorboard --logdir=logs
            tf.compat.v1.summary.FileWriter("logs/", self.sess.graph)
        self.sess.run(tf.compat.v1.global_variables_initializer())   # initialise all variables
        self.cost_his = []   # record every cost so the learning curve can be plotted at the end
    def _build_net(self):
        # -------------- eval_net: trained every step, holds the newest parameters --------------
        self.s = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s')               # receives the observation
        self.q_target = tf.compat.v1.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # receives the q_target values computed in learn()
        with tf.compat.v1.variable_scope('eval_net'):
            # c_names (collection names) are used later when copying the parameters into target_net;
            # default configuration of the layers
            c_names = ['eval_net_params', tf.compat.v1.GraphKeys.GLOBAL_VARIABLES]
            n_l1 = 10                                                         # number of units in the first layer
            w_initializer = tf.compat.v1.random_normal_initializer(0., 0.3)   # random weight initialiser
            b_initializer = tf.compat.v1.constant_initializer(0.1)            # constant bias initialiser
            # first layer of eval_net; the collections are used when updating target_net
            with tf.compat.v1.variable_scope('l1'):
                w1 = tf.compat.v1.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.compat.v1.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)
            # second layer of eval_net
            with tf.compat.v1.variable_scope('l2'):
                w2 = tf.compat.v1.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.compat.v1.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l1, w2) + b2   # one estimated Q value per action (q_eval)
        with tf.name_scope('loss'):       # error between q_target and q_eval
            self.loss = tf.reduce_mean(tf.compat.v1.squared_difference(self.q_target, self.q_eval))
        with tf.name_scope('train'):      # gradient descent
            self._train_op = tf.compat.v1.train.RMSPropOptimizer(self.lr).minimize(self.loss)
        # ---------------- target_net: provides the target Q values, updated only every replace_target_iter steps ----------------
        self.s_ = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='s_')   # receives the next observation
        with tf.compat.v1.variable_scope('target_net'):
            # c_names (collection names) are used when updating target_net
            c_names = ['target_net_params', tf.compat.v1.GraphKeys.GLOBAL_VARIABLES]
            # first layer of target_net, same structure as eval_net
            with tf.compat.v1.variable_scope('l1'):
                w1 = tf.compat.v1.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.compat.v1.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)
            # second layer of target_net
            with tf.compat.v1.variable_scope('l2'):
                w2 = tf.compat.v1.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.compat.v1.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l1, w2) + b2
    # store a transition in the replay memory
    def store_transition(self, s, a, r, s_):
        '''
        s:  current observation
        a:  action taken
        r:  reward received
        s_: next observation
        '''
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        # one record is the flat vector [s, a, r, s_]
        transition = np.hstack((s, [a, r], s_))
        # the memory has a fixed size; once it is full, old records are overwritten by new ones
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1
    # pick the action with the largest predicted Q value (epsilon-greedy)
    def choose_action(self, observation):
        observation = observation[np.newaxis, :]   # add a batch dimension
        if np.random.uniform() < self.epsilon:
            # let eval_net produce the Q value of every action and pick the largest
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)   # explore: random action
        return action
    # copy the latest eval_net parameters into target_net
    def _replace_target_params(self):
        t_params = tf.compat.v1.get_collection('target_net_params')
        e_params = tf.compat.v1.get_collection('eval_net_params')
        self.sess.run([tf.compat.v1.assign(t, e) for t, e in zip(t_params, e_params)])
    # learn from a random minibatch of memories and update the parameters
    def learn(self):
        # check whether it is time to replace the target_net parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget_params_replaced\n')
        # sample batch_size transitions from the replay memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]
        # run both networks:
        # q_next comes from target_net and is fed the last n_features columns (s_),
        # q_eval comes from eval_net and is fed the first n_features columns (s)
        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],
                self.s: batch_memory[:, :self.n_features]
            }
        )
        # The next few lines are important. q_next and q_eval hold values for every action,
        # but we only need the value of the action that was actually taken; the others must not
        # contribute. So we make every other action's error 0 and back-propagate only the error
        # of the chosen action.
        # The shape we want is, for example, q_target - q_eval = [1, 0, 0] - [-1, 0, 0] = [2, 0, 0]:
        # q_eval = [-1, 0, 0] means this memory used action 0, and action 0 gave Q(s, a0) = -1,
        # so the other entries Q(s, a1) = Q(s, a2) = 0.
        # q_target = [1, 0, 0] means r + gamma * maxQ(s_) = 1 for this memory, and regardless of
        # which action is best in s_, the target must sit at the same position as the chosen action
        # in q_eval, which is why the 1 goes into position 0.
        # To make this easy to compute, we first copy q_eval into q_target (so q_target - q_eval is
        # all zeros), then use the action column stored in batch_memory to overwrite, for each memory,
        # only the entry of the action that was taken with reward + gamma * maxQ(s_).
        # A concrete example follows.
        """
        Suppose the batch holds 2 memories and each produces values for 3 actions:
        q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        q_target = q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        Now overwrite q_target at the action that each memory actually used:
        memory 0 has target -1 and used action 0;
        memory 1 has target -2 and used action 2:
        q_target =
        [[-1, 2, 3],
         [ 4, 5, -2]]
        So (q_target - q_eval) becomes:
        [[(-1)-(1), 0, 0],
         [0, 0, (-2)-(6)]]
        This (q_target - q_eval) is the error that is back-propagated through the network.
        Every 0 belongs to an action that was not chosen at the time; only previously chosen
        actions have non-zero values, and only their errors are propagated.
        """
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)   # the action column
        reward = batch_memory[:, self.n_features + 1]                   # the reward column
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
        # train eval_net
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={
                                         self.s: batch_memory[:, :self.n_features],
                                         self.q_target: q_target
                                     })
        self.cost_his.append(self.cost)
        # gradually increase epsilon to make the behaviour less random
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
    # plot how the cost changed over training
    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()
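As a quick standalone sanity check of the q_target bookkeeping described inside learn(), the snippet below reproduces the 2-memory, 3-action example from the comments (memory 0 used action 0 with target -1, memory 1 used action 2 with target -2); the numbers are illustrative only.

import numpy as np

q_eval = np.array([[1., 2., 3.],
                   [4., 5., 6.]])
q_target = q_eval.copy()

batch_index = np.arange(2)           # memory 0 and memory 1
eval_act_index = np.array([0, 2])    # actions actually taken in each memory
targets = np.array([-1., -2.])       # r + gamma * max Q(s_) for each memory

# overwrite only the entries of the chosen actions
q_target[batch_index, eval_act_index] = targets
print(q_target - q_eval)
# [[-2.  0.  0.]
#  [ 0.  0. -8.]]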
Running DQN in the maze
from maze_env import Maze
from RL_brain import DeepQNetwork

def run_maze():
    step = 0   # controls when learning starts
    for episode in range(300):
        # reset the environment
        observation = env.reset()
        while True:
            # refresh the environment
            env.render()
            # DQN picks an action from the observation
            action = RL.choose_action(observation)
            # the environment returns the next state, the reward and whether the episode ended
            observation_, reward, done = env.step(action)
            # DQN stores the transition
            RL.store_transition(observation, action, reward, observation_)
            # control when and how often to learn (accumulate some memories first)
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            # the next state_ becomes the state of the next iteration
            observation = observation_
            # stop once the episode is over
            if done:
                break
            step += 1   # total number of steps
    # end of game
    print('game over')
    env.destroy()

if __name__ == "__main__":
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,   # replace the target_net parameters every 200 steps
                      memory_size=2000,          # replay-memory capacity
                      # output_graph=True        # whether to write a TensorBoard graph file
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()   # plot the network's cost curve
Gym simulations
CartPole-v0
import gym
from RL_brain import DeepQNetwork

env = gym.make('CartPole-v0')   # pick one of gym's environments
env = env.unwrapped             # remove gym's built-in limits (e.g. the episode step cap)
print(env.action_space)             # how many actions are available
print(env.observation_space)        # what the state observation looks like
print(env.observation_space.high)   # upper bounds of the observation
print(env.observation_space.low)    # lower bounds of the observation

# set up the DQN agent
RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=env.observation_space.shape[0],
                  learning_rate=0.01,
                  e_greedy=0.9,
                  replace_target_iter=100,
                  memory_size=2000,
                  e_greedy_increment=0.0008,
                  )
total_steps = 0   # step counter

for i_episode in range(100):
    # reset the environment
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()                             # refresh the display
        action = RL.choose_action(observation)   # pick an action
        observation_, reward, done, info = env.step(action)   # get the next state
        # unpack the observation so the default reward can be reshaped:
        # x is the cart's horizontal displacement, so r1 shrinks the further the cart drifts from the centre;
        # theta is the pole's angle from vertical, so r2 grows the more upright the pole is
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2   # the combined reward considers both position and angle, which makes DQN learn faster
        # DQN stores the transition
        RL.store_transition(observation, action, reward, observation_)
        # control when learning starts (accumulate some memories first)
        if total_steps > 1000:
            RL.learn()
        ep_r += reward
        if done:
            print('episode: ', i_episode,
                  'ep_r: ', round(ep_r, 2),
                  ' epsilon: ', round(RL.epsilon, 2))
            break
        observation = observation_
        total_steps += 1

RL.plot_cost()
MountainCar-v0
import gym
from RL_brain import DeepQNetwork
env = gym.make('MountainCar-v0')
env = env.unwrapped
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
RL = DeepQNetwork(n_actions=3,n_features=2,learning_rate=0.01,e_greedy=0.9,replace_target_iter=300,memory_size=3000,e_greedy_increment=0.0001)
total_step = 0
for i_episode in range(10):
observation = env.reset()
ep_r = 0
while True:
env.render()
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)
        position, velocity = observation_
        # reshape the reward: the further the car gets from the valley bottom at -0.5, the higher the reward
        reward = abs(position - (-0.5))
RL.store_transition(observation, action, reward, observation_)
if total_step > 1000:
RL.learn()
ep_r += reward
if done:
get = '| Get' if observation_[0] >= env.unwrapped.goal_position else '| ----'
print('Epi:',i_episode,
get,
'| Ep_r:',round(ep_r, 4),
'|Epsilon:',round(RL.epsilon,2))
break
observation = observation_
total_step += 1
RL.plot_cost()
Taxi-v3
import gym
from RL_brain import QLearningTable
import matplotlib.pyplot as plt
import numpy as np
env = gym.make('Taxi-v3')
state = env.reset()
taxirow, taxicol, passloc, destidx = env.unwrapped.decode(state)
print('taxi position = {}'.format((taxirow, taxicol)))
print('passenger position = {}'.format(env.unwrapped.locs[passloc]))
print('destination = {}'.format(env.unwrapped.locs[destidx]))
RL = QLearningTable(actions=list(range(env.action_space.n)))
total_reward = 0
episode_rewards = []
for episode in range(100):
observation = env.reset()
while True:
env.render()
action = RL.choose_action(str(observation))
observation_next, reward, done, info = env.step(action)
total_reward += reward
episode_rewards.append(total_reward)
RL.learn(str(observation),action, reward, str(observation_next))
observation = observation_next
if done:
break
plt.plot(episode_rewards)
plt.show()
# print('average episode reward = {} / {} = {}'.format(sum(episode_rewards), len(episode_rewards), np.mean(episode_rewards)))
Policy Gradients
Reinforcement learning is a mechanism for learning correct behaviour through rewards and punishments, and the family has many different members. Some learn reward values and pick the action they believe is most valuable, e.g. Q-learning and Deep Q Network.
Others skip the value analysis and output the action directly: Policy Gradients. Put simply, PG attaches a neural network that outputs the predicted action, so it can pick actions from a continuous range, whereas a method like Q-learning would have to compute values over infinitely many actions before choosing one, which is slow.
In short, policy gradient is another large family within RL. Unlike the value-based methods (Q-learning, Sarsa) it still takes in the environment's observation, but what it outputs is not the value of each action but a concrete action, so policy gradient skips the value stage altogether. Its biggest advantage is that the output action can be continuous: the value-based methods described earlier output discrete values and then pick the action with the largest one, while policy gradient can select an action from a continuous distribution.
The update works as follows. Suppose the network observed the state and chose the left action; it then wants to back-propagate so that this action becomes more likely next time, but the reward signal says the action was bad, so the probability of choosing it is lowered instead. If the chosen action was the right one, the network again back-propagates to make it more likely, and this time the reward signal says it was a good action, so we scale up this update and the action becomes much more likely next time.
Algorithm:
delta(log(Policy(s,a)) * V) measures how "surprised" we are by the chosen action a in state s. The smaller Policy(s,a) is, the larger -log(Policy(s,a)) becomes. If we obtain a large reward R (a large V) even though Policy(s,a) was small, then -delta(log(Policy(s,a)) * V) is even larger, meaning we are very surprised: I picked an action I rarely choose and it turned out to earn a good reward, so I should change my parameters by a large amount this time.
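In symbols, with v_t the discounted and normalised return computed per step by _discount_and_norm_rewards() below, each step contributes the gradient-ascent update

    \theta \leftarrow \theta + \alpha \, \nabla_\theta \log \pi_\theta(a_t \mid s_t) \, v_t

and the code implements this by minimising the equivalent loss -\log \pi_\theta(a_t \mid s_t) \, v_t.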
Code:
Main loop:
import gym
from RL_brain import PolicyGradient
import matplotlib.pyplot as plt

RENDER = False   # rendering slows things down, so only show the window once the agent has learned a bit
DISPLAY_REWARD_THRESHOLD = 400   # show the window once the running episode reward exceeds 400

env = gym.make('CartPole-v0')
env = env.unwrapped   # remove gym's built-in limits
env.seed(1)   # plain policy gradient has high variance between episodes, so pick a good random seed
print(env.action_space)             # available actions
print(env.observation_space)        # observation of the state
print(env.observation_space.high)   # upper bounds of the observation
print(env.observation_space.low)    # lower bounds of the observation

# set up the policy network
RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,   # gamma
    output_graph=True,
)

# main loop
for i_episode in range(3000):
    observation = env.reset()
    while True:
        if RENDER: env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)
        if done:
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # decide whether to render
            print("episode:", i_episode, " reward:", int(running_reward))
            vt = RL.learn()   # learn and get back vt
            if i_episode == 0:
                plt.plot(vt)
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break
        observation = observation_
The PolicyGradient class (RL_brain.py):
'''
Build the Policy Gradient network.
'''
import numpy as np
# the class uses the TF1 graph API (placeholders + Session) through compat.v1
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

class PolicyGradient:
    def __init__(self, n_actions, n_features, learning_rate=0.01, reward_decay=0.95, output_graph=False):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay   # decay rate of the reward
        # lists that hold one episode's data: observations, actions taken, rewards received
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        self._build_net()   # build the policy network
        self.sess = tf.compat.v1.Session()
        if output_graph:
            tf.compat.v1.summary.FileWriter("logs/", self.sess.graph)
        self.sess.run(tf.compat.v1.global_variables_initializer())
    def _build_net(self):
        with tf.name_scope('inputs'):
            self.tf_obs = tf.compat.v1.placeholder(tf.float32, [None, self.n_features], name='observation')   # receives the observations
            # receives the actions chosen during this episode
            self.tf_acts = tf.compat.v1.placeholder(tf.int32, [None, ], name='actions_num')
            # receives the value of each state-action pair (computed from the rewards)
            self.tf_vt = tf.compat.v1.placeholder(tf.float32, [None, ], name='actions_value')
        # first fully connected layer
        layer = tf.compat.v1.layers.dense(
            inputs=self.tf_obs,
            units=10,                   # number of units
            activation=tf.nn.tanh,      # activation function
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc1'
        )
        # second layer: outputs one score per action
        all_act = tf.compat.v1.layers.dense(
            inputs=layer,
            units=self.n_actions,       # number of outputs = number of actions
            activation=None,            # softmax is applied afterwards
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0.1),
            name='fc2'
        )
        self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')   # turn the scores into action probabilities
        # loss function
        with tf.name_scope('loss'):
            # maximising the total reward (log_p * R) is the same as minimising -(log_p * R),
            # and TensorFlow only knows how to minimise a loss
            # neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts)
            # the line below is equivalent and closer to the formula: the one-hot mask zeroes out actions that were not chosen
            neg_log_prob = tf.reduce_sum(-tf.compat.v1.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions), axis=1)
            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)   # vt = this reward + decayed future rewards; it scales the gradient
        '''
        tf.reduce_mean computes the mean of a tensor along a given axis; it is mostly used to
        reduce dimensions or to average a tensor (e.g. an image):
        reduce_mean(input_tensor,
                    axis=None,
                    keep_dims=False,
                    name=None,
                    reduction_indices=None)
        input_tensor: the tensor to reduce;
        axis: the axis to reduce along; if omitted, the mean of all elements is taken;
        keep_dims: if True the output keeps the input's shape, if False the reduced dimension is dropped;
        name: name of the operation.
        '''
        # optimise with Adam
        with tf.name_scope('train'):
            # Adam only offers minimize(), which is why the log term is negated above
            self.train_op = tf.compat.v1.train.AdamOptimizer(self.lr).minimize(loss)
    '''
    Actions are no longer picked by Q values but by probability; even without epsilon-greedy
    the behaviour has some randomness built in.
    '''
    def choose_action(self, observation):
        prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})   # probability of every action
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())   # sample an action according to the probabilities
        return action
    # store one step of the episode
    '''
    Append this step's observation, action and reward to the episode lists. The lists are
    cleared after every episode (that happens inside learn()), ready for the next episode's data.
    '''
    def store_transition(self, s, a, r):
        # append this step's data to the episode lists
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)
    # learn: update the parameters
    '''
    Massage this episode's rewards so they are easier to learn from: first decay future rewards
    with gamma as time goes on, then normalise the episode's state-action values to reduce the
    variance of policy gradient somewhat (following Andrej Karpathy's blog).
    '''
    def learn(self):
        # decay and normalise this episode's rewards
        discounted_ep_rs_norm = self._discount_and_norm_rewards()   # implemented below
        # train on the whole episode
        self.sess.run(self.train_op, feed_dict={
            self.tf_obs: np.vstack(self.ep_obs),          # shape=[None, n_obs]
            self.tf_acts: np.array(self.ep_as),           # shape=[None, ]
            self.tf_vt: discounted_ep_rs_norm,            # shape=[None, ]
        })
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # clear the episode data
        return discounted_ep_rs_norm                      # return this episode's state-action values
    # decay future rewards and normalise them
    def _discount_and_norm_rewards(self):
        discounted_ep_rs = np.zeros_like(self.ep_rs, dtype=np.float64)
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):
            running_add = running_add * self.gamma + self.ep_rs[t]
            discounted_ep_rs[t] = running_add
        # normalise: subtract the mean and divide by the standard deviation
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
        return discounted_ep_rs
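A tiny standalone example of the decay step performed by _discount_and_norm_rewards(), before the mean/std normalisation (the three rewards of 1 and gamma = 0.99 are made up purely for illustration):

import numpy as np

def discount(rewards, gamma=0.99):
    # walk backwards through the episode, accumulating the decayed return
    out = np.zeros(len(rewards))
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        out[t] = running_add
    return out

print(discount([1.0, 1.0, 1.0]))   # [2.9701 1.99   1.    ]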