Q-Learning Algorithm
Example 1
Reach the end point in the shortest distance: the agent 'o' starts at the left end of a one-dimensional world ('-----T') and must learn to walk right to the treasure 'T'.
import numpy as np
import pandas as pd
import time

np.random.seed(2)  # fix the random seed so every run is reproducible

N_STATES = 6                 # length of the 1-D world (distance from start to the treasure)
ACTIONS = ['left', 'right']  # available actions
EPSILON = 0.9                # greedy factor: probability of choosing the best-known action
ALPHA = 0.1                  # learning rate
GAMMA = 0.9                  # discount factor
MAX_EPISODES = 13            # maximum number of episodes
FRESH_TIME = 0.3             # refresh interval (seconds) when rendering a move
def build_q_table(n_states, actions):
    # create the Q-table: one row per state, one column per action
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),  # initial Q-values are all zero
        columns=actions,                     # column labels; the row index defaults to 0..n_states-1
    )
    return table
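For reference, the freshly built table is all zeros, with the row index as the state; a quick check (illustrative, not part of the original listing):

print(build_q_table(N_STATES, ACTIONS))
#    left  right
# 0   0.0    0.0
# 1   0.0    0.0
# 2   0.0    0.0
# 3   0.0    0.0
# 4   0.0    0.0
# 5   0.0    0.0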
def choose_action(state, q_table):
    # epsilon-greedy action selection
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        # explore: act randomly 10% of the time, and always while this state is still unexplored
        action_name = np.random.choice(ACTIONS)
    else:
        # exploit: 90% of the time pick the action with the highest Q-value
        action_name = state_actions.idxmax()
    return action_name
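Two spot checks of the policy (the demo values below are assumptions for illustration only):

demo = build_q_table(N_STATES, ACTIONS)
demo.loc[0, 'right'] = 0.5     # pretend state 0 has already been explored
print(choose_action(0, demo))  # usually 'right' (greedy 90% of the time, random otherwise)
print(choose_action(1, demo))  # row 1 is all zeros, so the choice is uniformly random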
def get_env_feedback(S, A):
    # environment dynamics: given state S and action A, return the next state and the reward
    if A == 'right':  # move right
        if S == N_STATES - 2:  # one step left of the treasure: this move reaches it
            S_ = 'terminal'
            R = 1  # the only reward in this world is for reaching the treasure
        else:
            S_ = S + 1
            R = 0
    else:  # move left
        R = 0
        if S == 0:
            S_ = S  # already at the left wall; stay put
        else:
            S_ = S - 1
    return S_, R
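A couple of spot checks (illustrative, not part of the original listing):

print(get_env_feedback(4, 'right'))  # ('terminal', 1): stepping right from state 4 reaches T
print(get_env_feedback(0, 'left'))   # (0, 0): walking into the left wall keeps S at 0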
def update_env(S, episode, step_counter):
    # render the environment, e.g. '--o---T', or print the episode summary at the end
    env_list = ['-'] * (N_STATES - 1) + ['T']  # the world looks like '-----T'
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')  # clear the line for the next episode
    else:
        env_list[S] = 'o'  # 'o' marks the agent's current position
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
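With the table, policy, environment, and renderer in place, the remaining piece is the training loop. The sketch below applies the standard Q-learning update Q(S,A) <- Q(S,A) + ALPHA * (R + GAMMA * max_a Q(S',a) - Q(S,A)); the function name rl and the final printout are illustrative choices, not fixed by the listing above.

def rl():
    # main training loop: interact with the environment, then update Q(S, A) toward the TD target
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0  # every episode starts at the left end
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # act and observe the next state and reward
            q_predict = q_table.loc[S, A]   # current estimate Q(S, A)
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_, :].max()  # R + gamma * max_a Q(S', a)
            else:
                q_target = R  # the terminal state has no successor
                is_terminated = True
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # Q-learning update
            S = S_
            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table

if __name__ == '__main__':
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)

After training, the 'right' column should carry the larger values, so the greedy policy walks straight to the treasure.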