Q-Learning 是强化学习中的一种基于价值(value-based)的算法,最终会学出一个收敛的表格 Q-Table。
在 Q-Learning 中,我们用 Q(S, A) 来表示价值函数(value function)。这里只讨论状态(state)和行动(action)数量有限的情况;在这种情况下,学习过程其实等价于不断维护并更新一个 Q-table,直至其收敛。
实例
1.导入工具包
import numpy as np
import pandas as pd
import time
# Seed NumPy's global random generator so that runs are reproducible.
np.random.seed(2)
N_STATES = 6 # number of states; also the width of the printed 1-D track
ACTIONS = ['LEFT','RIGHT'] # names of the available actions (Q-table columns)
EPSILON = 0.9 # epsilon-greedy policy: a random action is taken when a uniform draw exceeds this
ALPHA = 0.1 # learning rate
LAMBDA = 0.9 # discount factor applied to the best next-state value
MAX_EPISODES = 13 # maximum number of training episodes
FRESH_TIME = 0.003 # pause (seconds, passed to time.sleep) after each redraw
2.定义Q_TABLE
def build_q_table(n_states, actions):
    """Create the initial Q-table, print it, and return it.

    Args:
        n_states: Number of rows in the table (one row per state).
        actions: Sequence of action names; each becomes a column label.

    Returns:
        A pandas DataFrame of shape ``(n_states, len(actions))`` whose
        entries are all ``0.0``.
    """
    zeros = np.zeros((n_states, len(actions)))
    q_table = pd.DataFrame(zeros, columns=actions)
    print(q_table)  # show the freshly initialised table
    return q_table
# Demo call: prints the all-zero initial Q-table; the returned table is discarded.
build_q_table(N_STATES,ACTIONS)
3.动作选择
def choose_action(state, q_table, epsilon=None):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    A random action is returned when a uniform draw from ``[0, 1)`` exceeds
    ``epsilon`` (exploration) or when every action value of the state is
    still zero (nothing learned yet). Otherwise the action with the largest
    Q-value is returned.

    Args:
        state: Integer row position of the current state in ``q_table``.
        q_table: DataFrame with one row per state and one column per action.
        epsilon: Probability of acting greedily. Defaults to the
            module-level ``EPSILON`` when omitted.

    Returns:
        The name (column label) of the chosen action.
    """
    if epsilon is None:
        epsilon = EPSILON
    state_actions = q_table.iloc[state, :]
    explore = np.random.uniform() > epsilon
    # "All values are zero" must be tested element-wise. The previous
    # `state_actions.all() == 0` was true as soon as ANY value was zero,
    # which kept the agent acting randomly in partly trained states.
    untrained = (state_actions == 0).all()
    if explore or untrained:
        # Action names come from the table's own columns.
        return np.random.choice(list(state_actions.index))
    # idxmax returns the column label directly, independent of whether
    # argmax yields a position or a label in the installed pandas version.
    return state_actions.idxmax()
4.确定环境反馈
# A reward is given only when the agent reaches the rightmost point.
def get_env_feedback(S, A):
    """Return ``(next_state, reward)`` for taking action ``A`` in state ``S``.

    Args:
        S: Current state, an integer position on the track.
        A: Action name; ``'RIGHT'`` moves right, anything else moves left.

    Returns:
        A tuple ``(S_, R)``. ``S_`` is the next integer position, or the
        string ``'TERMINAL'`` when moving right from position
        ``N_STATES - 2``. ``R`` is 1 for that terminal move and 0 otherwise.
    """
    if A != 'RIGHT':
        # Left move: never rewarded, and position 0 acts as a wall.
        next_state = S if S == 0 else S - 1
        return next_state, 0
    if S == N_STATES - 2:
        # One step left of the goal: moving right ends the episode.
        return 'TERMINAL', 1
    return S + 1, 0
5.创建环境
def update_env(S, episode, step_counter):
    """Redraw the one-line text view of the environment on stdout.

    For a regular state the track is printed as ``N_STATES - 1`` underscores
    followed by ``GOAL``, with ``*`` at position ``S``. For the
    ``'TERMINAL'`` state a summary of the finished episode is shown briefly
    and then erased.

    Args:
        S: Current state: an integer position, or the string ``'TERMINAL'``.
        episode: Zero-based episode index (displayed as ``episode + 1``).
        step_counter: Number of steps taken so far in this episode.
    """
    if S == 'TERMINAL':
        interaction = 'Episode %s : total_steps = %s' % (episode+1, step_counter)
        # flush=True: the output has no newline, so a line-buffered stdout
        # would otherwise hold it back and the frame would never be shown.
        print('\r{}'.format(interaction), end='', flush=True)
        time.sleep(FRESH_TIME)
        # Blank out the whole summary (a single space only erased its first
        # character) and return the cursor to the start of the line.
        print('\r{}\r'.format(' ' * len(interaction)), end='', flush=True)
    else:
        env_list = ['_']*(N_STATES-1) + ['GOAL']
        env_list[S] = '*'  # mark the agent's position
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='', flush=True)
        time.sleep(FRESH_TIME)
def main():
    """Run tabular Q-learning for ``MAX_EPISODES`` episodes.

    Every episode starts at state 0 and runs until the environment reports
    ``'TERMINAL'``. After each step the chosen state/action entry is moved
    towards the target by a fraction ``ALPHA`` of the difference.

    Returns:
        The Q-table (pandas DataFrame) after the final episode.
    """
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        steps = 0
        state = 0  # every episode restarts at the leftmost position
        update_env(state, episode, steps)
        while True:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            done = next_state == 'TERMINAL'
            current_estimate = q_table.loc[state, action]
            if done:
                # No future value beyond the terminal state.
                target = reward
            else:
                target = reward + LAMBDA * q_table.iloc[next_state, :].max()
            q_table.loc[state, action] += ALPHA * (target - current_estimate)
            state = next_state
            steps += 1
            update_env(state, episode, steps)
            if done:
                break
    return q_table
# Train only when executed as a script, then print the learned Q-table.
if __name__ == "__main__":
    q_table = main()
    print(q_table)
综合起来:
# coding: utf-8
# ## 1.导入工具包
import numpy as np
import pandas as pd
import time
# Seed NumPy's global random generator so that runs are reproducible.
np.random.seed(2)
N_STATES = 6 # number of states; also the width of the printed 1-D track
ACTIONS = ['LEFT','RIGHT'] # names of the available actions (Q-table columns)
EPSILON = 0.9 # epsilon-greedy policy: a random action is taken when a uniform draw exceeds this
ALPHA = 0.1 # learning rate
LAMBDA = 0.9 # discount factor applied to the best next-state value
MAX_EPISODES = 13 # maximum number of training episodes
FRESH_TIME = 0.001 # pause (seconds, passed to time.sleep) after each redraw
# ## 2.定义Q_TABLE
def build_q_table(n_states, actions):
    """Create the initial Q-table, print it, and return it.

    Args:
        n_states: Number of rows in the table (one row per state).
        actions: Sequence of action names; each becomes a column label.

    Returns:
        A pandas DataFrame of shape ``(n_states, len(actions))`` whose
        entries are all ``0.0``.
    """
    zeros = np.zeros((n_states, len(actions)))
    q_table = pd.DataFrame(zeros, columns=actions)
    print(q_table)  # show the freshly initialised table
    return q_table
# Demo call: prints the all-zero initial Q-table; the returned table is discarded.
build_q_table(N_STATES,ACTIONS)
# ## 3.动作选择
def choose_action(state, q_table, epsilon=None):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    A random action is returned when a uniform draw from ``[0, 1)`` exceeds
    ``epsilon`` (exploration) or when every action value of the state is
    still zero (nothing learned yet). Otherwise the action with the largest
    Q-value is returned.

    Args:
        state: Integer row position of the current state in ``q_table``.
        q_table: DataFrame with one row per state and one column per action.
        epsilon: Probability of acting greedily. Defaults to the
            module-level ``EPSILON`` when omitted.

    Returns:
        The name (column label) of the chosen action.
    """
    if epsilon is None:
        epsilon = EPSILON
    state_actions = q_table.iloc[state, :]
    explore = np.random.uniform() > epsilon
    # "All values are zero" must be tested element-wise. The previous
    # `state_actions.all() == 0` was true as soon as ANY value was zero,
    # which kept the agent acting randomly in partly trained states.
    untrained = (state_actions == 0).all()
    if explore or untrained:
        # Action names come from the table's own columns.
        return np.random.choice(list(state_actions.index))
    # idxmax returns the column label directly, independent of whether
    # argmax yields a position or a label in the installed pandas version.
    return state_actions.idxmax()
# ## 4.确定环境反馈
# A reward is given only when the agent reaches the rightmost point.
def get_env_feedback(S, A):
    """Return ``(next_state, reward)`` for taking action ``A`` in state ``S``.

    Args:
        S: Current state, an integer position on the track.
        A: Action name; ``'RIGHT'`` moves right, anything else moves left.

    Returns:
        A tuple ``(S_, R)``. ``S_`` is the next integer position, or the
        string ``'TERMINAL'`` when moving right from position
        ``N_STATES - 2``. ``R`` is 1 for that terminal move and 0 otherwise.
    """
    if A != 'RIGHT':
        # Left move: never rewarded, and position 0 acts as a wall.
        next_state = S if S == 0 else S - 1
        return next_state, 0
    if S == N_STATES - 2:
        # One step left of the goal: moving right ends the episode.
        return 'TERMINAL', 1
    return S + 1, 0
# ## 5.创建环境
def update_env(S, episode, step_counter):
    """Redraw the one-line text view of the environment on stdout.

    For a regular state the track is printed as ``N_STATES - 1`` underscores
    followed by ``GOAL``, with ``*`` at position ``S``. For the
    ``'TERMINAL'`` state a summary of the finished episode is shown briefly
    and then erased.

    Args:
        S: Current state: an integer position, or the string ``'TERMINAL'``.
        episode: Zero-based episode index (displayed as ``episode + 1``).
        step_counter: Number of steps taken so far in this episode.
    """
    if S == 'TERMINAL':
        interaction = 'Episode %s : total_steps = %s' % (episode+1, step_counter)
        # flush=True: the output has no newline, so a line-buffered stdout
        # would otherwise hold it back and the frame would never be shown.
        print('\r{}'.format(interaction), end='', flush=True)
        time.sleep(FRESH_TIME)
        # Blank out the whole summary (a single space only erased its first
        # character) and return the cursor to the start of the line.
        print('\r{}\r'.format(' ' * len(interaction)), end='', flush=True)
    else:
        env_list = ['_']*(N_STATES-1) + ['GOAL']
        env_list[S] = '*'  # mark the agent's position
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='', flush=True)
        time.sleep(FRESH_TIME)
def main():
    """Run tabular Q-learning for ``MAX_EPISODES`` episodes.

    Every episode starts at state 0 and runs until the environment reports
    ``'TERMINAL'``. After each step the chosen state/action entry is moved
    towards the target by a fraction ``ALPHA`` of the difference.

    Returns:
        The Q-table (pandas DataFrame) after the final episode.
    """
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        print('episode:', episode)
        steps = 0
        state = 0  # every episode restarts at the leftmost position
        update_env(state, episode, steps)
        while True:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            done = next_state == 'TERMINAL'
            current_estimate = q_table.loc[state, action]
            if done:
                # No future value beyond the terminal state.
                target = reward
            else:
                target = reward + LAMBDA * q_table.iloc[next_state, :].max()
            q_table.loc[state, action] += ALPHA * (target - current_estimate)
            state = next_state
            steps += 1
            update_env(state, episode, steps)
            if done:
                break
    return q_table
# Train only when executed as a script, then print the learned Q-table.
if __name__ == "__main__":
    q_table = main()
    print(q_table)