A Simple Implementation of Q-learning
The result: an agent 'o' starts at the left end of a one-dimensional world '-----T' and, episode by episode, learns to walk right to the treasure 'T'.
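The console animation looks roughly like the frames below (each frame overwrites the previous one on the same line; the actual number of steps per episode varies with the random exploration, so the step count is left as a placeholder):

o----T
-o---T
--o--T
---o-T
----oT
Episode 1: total_steps = ...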
Build the Q-table
import numpy as np
import pandas as pd
import time

np.random.seed(2)

N_STATES = 6        # number of states: the agent's initial distance (in steps) from the treasure
ACTIONS = ['left', 'right']   # available actions
EPSILON = 0.9       # greedy policy: probability of exploiting the best known action
ALPHA = 0.1         # learning rate
LAMBDA = 0.9        # discount factor for future rewards
MAX_EPISODES = 13   # play at most 13 episodes
FRESH_TIME = 0.3    # 0.3 seconds per move

# Build the Q-table
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),   # initial Q-table values
        columns=actions,
    )
    print(table)
    return table

build_q_table(N_STATES, ACTIONS)
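Running build_q_table(N_STATES, ACTIONS) prints the all-zero initial table, one row per state and one column per action:

   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0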
The complete code
import numpy as np
import pandas as pd
import time

np.random.seed(2)

N_STATES = 6        # number of states: the agent's initial distance (in steps) from the treasure
ACTIONS = ['left', 'right']   # available actions
EPSILON = 0.9       # greedy policy: probability of exploiting the best known action
ALPHA = 0.1         # learning rate
LAMBDA = 0.9        # discount factor for future rewards
MAX_EPISODES = 13   # play at most 13 episodes
FRESH_TIME = 0.1    # 0.1 seconds per move

# Build the Q-table
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),   # initial Q-table values
        columns=actions,
    )
    print(table)
    return table

# How to choose an action (epsilon-greedy)
def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        # 10% of the time, or when this state is still unexplored, choose a random action
        action_name = np.random.choice(ACTIONS)
    else:
        # 90% of the time, choose the action with the largest Q value
        action_name = state_actions.idxmax()
    return action_name

# The environment's feedback to our action
def get_env_feedback(S, A):
    # S_ is the next state, R is the reward
    if A == 'right':
        if S == N_STATES - 2:   # one more step right reaches the treasure
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:   # A == 'left'
        R = 0
        if S == 0:
            S_ = S   # at the leftmost position we cannot move left; stay put
        else:
            S_ = S - 1
    return S_, R

# Render the environment
def update_env(S, episode, step_counter):
    # Update the environment: reaching the rightmost cell wins; the agent moves in a 1-D world
    env_list = ['-'] * (N_STATES - 1) + ['T']   # '-----T' is the environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')   # clear the line
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)

# Main loop
def rl():   # reinforcement learning
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0   # initial state; S is just a number: 0, 1, 2, 3, ...
        is_terminated = False   # whether this episode has ended
        update_env(S, episode, step_counter)   # render the initial environment
        while not is_terminated:
            A = choose_action(S, q_table)    # A is 'left' or 'right'
            S_, R = get_env_feedback(S, A)   # take the action, get the next state and reward
            q_predict = q_table.loc[S, A]    # the Q estimate
            if S_ != 'terminal':
                # q_target is the "true" Q value
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()
            else:
                q_target = R
                is_terminated = True
            # Update the Q-table
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_   # move to the next state
            update_env(S, episode, step_counter + 1)
            step_counter += 1
        print(q_table)   # show the Q-table after each episode
    return q_table

if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
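The update inside the while loop is the standard Q-learning rule. In the code, q_predict is Q(S, A), q_target is R + γ max_a Q(S', a) (or just R when S' is terminal), α is ALPHA, and γ is the discount factor called LAMBDA in the code:

$$Q(S, A) \leftarrow Q(S, A) + \alpha \left[ R + \gamma \max_{a} Q(S', a) - Q(S, A) \right]$$

Because the only positive reward sits at the treasure, repeated updates propagate value leftward through the table, so after a few episodes the 'right' column dominates and the agent heads straight for 'T'.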
Reference:
莫烦Python