作为一个非专业的初学爱好者,我在看了一些强化学习教程之后,决定从 Q-table 入门强化学习。我参考的资料很多,个人感觉下面这个链接讲得不错: https://mp.weixin.qq.com/s/34E1tEQMZuaxvZA66_HRwA 。之前接触过 Q-table 的简单理论,但一直没有动手实践,一写代码才发现很多问题其实自己并没有考虑清楚。现在附上一份刚写不久的 Q-table 代码。详细原理不再多说,代码中不懂的地方可以看注释。
import numpy as np
from time import sleep
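# Q-values effectively get updated backwards: entries closest to the goal pick up value first, and the update rule then spreads it step by step toward the starting cell.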
class Q_table():
    """Tabular Q-learning on a small grid world.

    The world is the 2-D ``Reward`` grid (7 rows x 10 columns).  ``table``
    stores one Q-value per ``(action, row, column)``; the action index means
    0 = up, 1 = right, 2 = down, 3 = left.
    """

    # (row delta, column delta) for each action index: up, right, down, left.
    _MOVES = ((-1, 0), (0, 1), (1, 0), (0, -1))

    def __init__(self):
        """Create the reward grid and the initial Q-table, then print the rewards."""
        # Constant subtracted from every reward; only there to experiment with
        # different reward ranges, not required for the algorithm.
        self.offset = 0
        # The choice of reward values matters a lot for what gets learned.
        self.Reward = np.array([
            [0, 0, 0, 0, -1, 0, 0, 0, 0, 0],
            [0, -1, -1, 0, 0, 0, 0, 0, -1, 0],
            [0, 0, 0, -1, 0, 0, -1, 0, 0, 0],
            [-1, 0, 0, 0, 0, -1, 0, 0, 0, 0],
            [0, -1, 0, -1, -1, 0, 0, 0, 0, 0],
            # The aim is to walk from the top-left corner to the 1 in this row.
            # The -1 cells are meant to be avoided; they are only penalised,
            # nothing in this class physically blocks them.
            [0, 0, 0, 0, 0, -1, 1, -1, 0, 0],
            [0, 0, 0, -1, 0, 0, 0, 0, 0, 0],
        ]) - self.offset

        rows, cols = self.Reward.shape
        self.table = np.zeros([len(self._MOVES), rows, cols])
        # Actions that would step off the grid get a very low Q-value.  They
        # are never updated afterwards, so a greedy walk never selects them.
        self.table[0, 0, :] = -99    # up from the top row
        self.table[1, :, -1] = -99   # right from the last column
        self.table[2, -1, :] = -99   # down from the bottom row
        self.table[3, :, 0] = -99    # left from the first column

        print(self.Reward)
        self.cur_y, self.cur_x = 0, 0  # current (row, column)
        self.lr = 0.8                  # learning rate
        self.discount = 0.8            # discount factor

    def update_Q_table(self, pos_y, pos_x, action):
        """Apply one Q-learning update for taking ``action`` at ``(pos_y, pos_x)``.

        Args:
            pos_y: Row of the cell the action is taken from.
            pos_x: Column of the cell the action is taken from.
            action: 0 = up, 1 = right, 2 = down, 3 = left.

        The position is always recorded in ``cur_y`` / ``cur_x``.  If the
        action is not one of the four known ones, or would leave the grid,
        the Q-table is left unchanged.  The goal cell is not treated as
        terminal: its own Q-values are bootstrapped like any other cell's.
        """
        self.cur_y, self.cur_x = pos_y, pos_x
        if not 0 <= action < len(self._MOVES):
            return

        step_y, step_x = self._MOVES[action]
        next_y, next_x = pos_y + step_y, pos_x + step_x
        rows, cols = self.Reward.shape
        if not (0 <= next_y < rows and 0 <= next_x < cols):
            return  # the move would leave the grid, so there is nothing to learn

        # Best Q-value reachable from the cell the move lands on.
        best_next_q = np.max(self.table[:, next_y, next_x])
        # The reward is the one attached to the cell being entered.
        delta_q = self.Reward[next_y, next_x] + self.discount * best_next_q \
            - self.table[action, pos_y, pos_x]
        self.table[action, pos_y, pos_x] += self.lr * delta_q

    def show_actions(self, max_steps=20, delay=0.5):
        """Walk greedily from the top-left cell and print the path after each step.

        Args:
            max_steps: Maximum number of moves to perform.
            delay: Seconds to sleep after printing each step.

        At every step the action with the largest Q-value in the current cell
        is taken (the first one on ties), the visited cell is marked with 1 on
        a map of the grid, and the map is printed.  The walk stops early once
        it stands on the cell whose reward equals ``1 - self.offset``.
        """
        pos_y, pos_x = 0, 0
        whole_map = np.zeros(self.Reward.shape)
        whole_map[pos_y, pos_x] = 1
        for _ in range(max_steps):
            next_action = np.argmax(self.table[:, pos_y, pos_x])
            step_y, step_x = self._MOVES[next_action]
            pos_y += step_y
            pos_x += step_x
            # The map is not reset between steps, so the whole trail stays visible.
            whole_map[pos_y, pos_x] = 1
            print('=' * 40)
            print(whole_map)
            sleep(delay)
            if self.Reward[pos_y, pos_x] == 1 - self.offset:
                break
# Train the table, then print it and replay the greedy path.
qtable = Q_table()
# 200 training sweeps; each sweep applies one update to every
# (column, row, action) triple, columns outermost and actions innermost.
for n in range(1, 201):
    for x, y, a in np.ndindex(10, 7, 4):
        qtable.update_Q_table(y, x, a)
print(qtable.table)
qtable.show_actions()
只要安装了 numpy 就能运行上面的代码,以下是运行结果:
[[ 0 0 0 0 -1 0 0 0 0 0]
[ 0 -1 -1 0 0 0 0 0 -1 0]
[ 0 0 0 -1 0 0 -1 0 0 0]
[-1 0 0 0 0 -1 0 0 0 0]
[ 0 -1 0 -1 -1 0 0 0 0 0]
[ 0 0 0 0 0 -1 1 -1 0 0]
[ 0 0 0 -1 0 0 0 0 0 0]]
[[[-99. -99. -99. -99. -99.
-99. -99. -99. -99. -99. ]
[ 0.15270995 0.19088744 0.23860929 0.29826162 -0.62717298
0.46603378 0.58254222 0.72817778 0.58254222 0.46603378]
[ 0.19088744 -0.76139071 -0.70173838 0.37282702 0.46603378
0.58254222 0.72817778 0.91022222 -0.27182222 0.58254222]
[ 0.23860929 0.29826162 0.37282702 -0.70173838 0.37282702
0.46603378 0.42222222 1.13777778 0.91022222 0.72817778]
[ -0.70173838 0.37282702 0.46603378 0.37282702 0.33777778
0.42222222 1.77777778 1.42222222 1.13777778 0.91022222]
[ 0.37282702 -0.53396622 0.58254222 -0.27182222 0.42222222
1.77777778 2.22222222 1.77777778 1.42222222 1.13777778]
[ 0.46603378 0.58254222 0.72817778 0.91022222 1.13777778
1.22222222 2.77777778 1.22222222 1.13777778 0.91022222]]
[[ 0.19088744 0.23860929 0.29826162 -0.62717298 0.46603378
0.58254222 0.72817778 0.58254222 0.46603378 -99. ]
[ -0.76139071 -0.70173838 0.37282702 0.46603378 0.58254222
0.72817778 0.91022222 -0.27182222 0.58254222 -99. ]
[ 0.29826162 0.37282702 -0.70173838 0.37282702 0.46603378
0.42222222 1.13777778 0.91022222 0.72817778 -99. ]
[ 0.37282702 0.46603378 0.37282702 0.33777778 0.42222222
1.77777778 1.42222222 1.13777778 0.91022222 -99. ]
[ -0.53396622 0.58254222 -0.27182222 0.42222222 1.77777778
2.22222222 1.77777778 1.42222222 1.13777778 -99. ]
[ 0.58254222 0.72817778 0.91022222 1.13777778 1.22222222
2.77777778 1.22222222 1.13777778 0.91022222 -99. ]
[ 0.46603378 0.58254222 0.13777778 1.42222222 1.77777778
2.22222222 1.77777778 1.42222222 1.13777778 -99. ]]
[[ 0.19088744 -0.76139071 -0.70173838 0.37282702 0.46603378
0.58254222 0.72817778 0.91022222 -0.27182222 0.58254222]
[ 0.23860929 0.29826162 0.37282702 -0.70173838 0.37282702
0.46603378 0.42222222 1.13777778 0.91022222 0.72817778]
[ -0.70173838 0.37282702 0.46603378 0.37282702 0.33777778
0.42222222 1.77777778 1.42222222 1.13777778 0.91022222]
[ 0.37282702 -0.53396622 0.58254222 -0.27182222 0.42222222
1.77777778 2.22222222 1.77777778 1.42222222 1.13777778]
[ 0.46603378 0.58254222 0.72817778 0.91022222 1.13777778
1.22222222 2.77777778 1.22222222 1.13777778 0.91022222]
[ 0.37282702 0.46603378 0.58254222 0.13777778 1.42222222
1.77777778 2.22222222 1.77777778 1.42222222 1.13777778]
[-99. -99. -99. -99. -99.
-99. -99. -99. -99. -99. ]]
[[-99. 0.15270995 0.19088744 0.23860929 0.29826162
-0.62717298 0.46603378 0.58254222 0.72817778 0.58254222]
[-99. 0.19088744 -0.76139071 -0.70173838 0.37282702
0.46603378 0.58254222 0.72817778 0.91022222 -0.27182222]
[-99. 0.23860929 0.29826162 0.37282702 -0.70173838
0.37282702 0.46603378 0.42222222 1.13777778 0.91022222]
[-99. -0.70173838 0.37282702 0.46603378 0.37282702
0.33777778 0.42222222 1.77777778 1.42222222 1.13777778]
[-99. 0.37282702 -0.53396622 0.58254222 -0.27182222
0.42222222 1.77777778 2.22222222 1.77777778 1.42222222]
[-99. 0.46603378 0.58254222 0.72817778 0.91022222
1.13777778 1.22222222 2.77777778 1.22222222 1.13777778]
[-99. 0.37282702 0.46603378 0.58254222 0.13777778
1.42222222 1.77777778 2.22222222 1.77777778 1.42222222]]]
========================================
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]