A First Look at Reinforcement Learning: A Q-table Example with Runnable Python Code

As a non-professional hobbyist, I decided after working through a few reinforcement learning tutorials to start with Q-tables. I consulted a lot of material; personally I found this link https://mp.weixin.qq.com/s/34E1tEQMZuaxvZA66_HRwA quite good. I had touched on the basic theory of Q-tables before but had never implemented one, and only once I started writing code did I realize how many details I had not actually thought through. Below is a Q-table implementation I wrote recently. I will not repeat the detailed theory here; anything unclear in the code is explained in the comments.
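
For reference, the rule implemented in update_Q_table below is the standard Q-learning update, written here with the same names used in the code (lr is the learning rate, discount is the discount factor):

Q[a, y, x] += lr * ( Reward[next_y, next_x] + discount * max_a' Q[a', next_y, next_x] - Q[a, y, x] )

where (next_y, next_x) is the cell reached by taking action a from (y, x), and max_a' is the maximum over the four actions available at that next cell.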

import numpy as np
from time import sleep

# Note: the Q-table is effectively filled in backwards. Cells next to the goal get meaningful
# Q-values first, and the update rule then propagates those values, sweep by sweep,
# back toward the starting cell.

class Q_table():
    def __init__(self):
        self.table = np.zeros([4, 7, 10])  # [action, row, col]; action 0: up, 1: right, 2: down, 3: left
        self.table[0, 0, :] = -99  # actions that would leave the grid get a very low Q-value
        self.table[1, :, 9] = -99  # so the greedy policy never picks them
        self.table[2, 6, :] = -99
        self.table[3, :, 0] = -99
        self.offset = 0  # only used to test rewards shifted into a different range; not required
        self.Reward = np.array([  # the design of the reward values matters a lot!
            [0, 0, 0, 0, -1, 0, 0, 0, 0, 0],
            [0, -1, -1, 0, 0, 0, 0, 0, -1, 0],
            [0, 0, 0, -1, 0, 0, -1, 0, 0, 0],
            [-1, 0, 0, 0, 0, -1, 0, 0, 0, 0],
            [0, -1, 0, -1, -1, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, -1, 1, -1, 0, 0],  # goal: walk from the top-left corner to the 1 in this row; the -1 cells should be avoided
            [0, 0, 0, -1, 0, 0, 0, 0, 0, 0],
        ]) - self.offset
        print(self.Reward)
        self.cur_y, self.cur_x = 0, 0  # current coordinates
        self.lr = 0.8  # learning rate
        self.discount = 0.8  # discount factor

    def update_Q_table(self, pos_y, pos_x, action):  # update the Q-table given the current position and the action to take
        self.cur_y, self.cur_x = pos_y, pos_x
        next_y, next_x = self.cur_y, self.cur_x  # initialize next_y and next_x
        update_flag = False
        # resolve the chosen action
        if action == 0:
            if self.cur_y > 0:  # keep next_y inside the grid, otherwise the action is invalid
                next_y, next_x = self.cur_y - 1, self.cur_x
                update_flag = True
        elif action == 1:
            if self.cur_x < 9:  # keep next_x inside the grid, otherwise the action is invalid
                next_y, next_x = self.cur_y, self.cur_x + 1
                update_flag = True
        elif action == 2:
            if self.cur_y < 6:
                next_y, next_x = self.cur_y + 1, self.cur_x
                update_flag = True
        elif action == 3:
            if self.cur_x > 0:
                next_y, next_x = self.cur_y, self.cur_x - 1
                update_flag = True
        # only update the table if the action was valid
        if update_flag:
            # Suppose we start at (x0, y0), i.e. (cur_x, cur_y), and the action moves us to
            # (x1, y1), i.e. (next_x, next_y). Collect the Q-values of the 4 actions
            # available at (x1, y1).
            # (An earlier draft also added boundary checks for each of those 4 actions here,
            # but they are unnecessary: update_flag already guarantees (x1, y1) is inside the grid.)
            next_pos_all_Q_actions_list = []
            for i in range(4):
                next_pos_all_Q_actions_list.append(self.table[i, next_y, next_x])  # Q-values of the 4 actions at (x1, y1)
            next_pos_all_Q_actions = np.array(next_pos_all_Q_actions_list)
            max_next_pos_Q_val = np.max(next_pos_all_Q_actions)  # the best of those 4 Q-values
            # Standard Q-learning update:
            # Q(s, a) += lr * (R(s') + discount * max_a' Q(s', a') - Q(s, a))
            delta_Q = self.Reward[next_y, next_x] + self.discount * max_next_pos_Q_val \
                      - self.table[action, self.cur_y, self.cur_x]
            self.table[action, self.cur_y, self.cur_x] += self.lr * delta_Q  # update Q

    def show_actions(self):  # demonstrate the learned policy by greedily following the Q-table from the start
        pos_y, pos_x = 0, 0
        whole_map = np.zeros([7, 10])
        whole_map[pos_y, pos_x] = 1
        for i in range(20):
            # at each step take the action with the highest Q-value for the current cell
            next_action = np.argmax([self.table[0, pos_y, pos_x],
                                     self.table[1, pos_y, pos_x],
                                     self.table[2, pos_y, pos_x],
                                     self.table[3, pos_y, pos_x]])
            if next_action == 0:
                pos_y -= 1
            elif next_action == 1:
                pos_x += 1
            elif next_action == 2:
                pos_y += 1
            elif next_action == 3:
                pos_x -= 1
            # whole_map = np.zeros([7, 10])  # uncomment to show only the current position instead of the whole trail
            whole_map[pos_y, pos_x] = 1
            print('=' * 40)
            print(whole_map)
            sleep(0.5)
            if self.Reward[pos_y, pos_x] == 1 - self.offset:  # stop once the goal cell is reached
                break


qtable = Q_table()
n = 0
while True:
    # one training pass: apply the update from every cell with every action
    for x in range(10):
        for y in range(7):
            for a in range(4):
                qtable.update_Q_table(y, x, a)
    n += 1
    if n == 200:  # train for 200 passes
        break
print(qtable.table)
qtable.show_actions()

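The training loop above simply sweeps every (cell, action) pair 200 times, which is easy to reason about on a grid this small because every transition keeps getting refreshed. A more common Q-learning setup instead samples episodes from the start cell and explores with an epsilon-greedy policy. The following is only a rough sketch of that variant, reusing the Q_table class above; the epsilon value, episode count, and step cap are arbitrary, untuned choices.

import random
import numpy as np

def train_epsilon_greedy(qtable, episodes=500, epsilon=0.2, max_steps=100):
    # Each episode starts at the top-left corner. With probability epsilon a random
    # action is tried, otherwise the action with the highest current Q-value.
    for _ in range(episodes):
        y, x = 0, 0
        for _ in range(max_steps):
            if random.random() < epsilon:
                a = random.randint(0, 3)
            else:
                a = int(np.argmax(qtable.table[:, y, x]))
            qtable.update_Q_table(y, x, a)
            # move only if the action stays inside the 7 x 10 grid
            if a == 0 and y > 0:
                y -= 1
            elif a == 1 and x < 9:
                x += 1
            elif a == 2 and y < 6:
                y += 1
            elif a == 3 and x > 0:
                x -= 1
            if qtable.Reward[y, x] == 1 - qtable.offset:  # end the episode at the goal cell
                break

Because out-of-bounds actions were initialized to -99, the greedy branch essentially never picks them, but the bounds checks keep the random exploration branch from stepping off the grid.
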
The sweep-based script above needs nothing beyond numpy to run. Here is its output:

[[ 0  0  0  0 -1  0  0  0  0  0]
 [ 0 -1 -1  0  0  0  0  0 -1  0]
 [ 0  0  0 -1  0  0 -1  0  0  0]
 [-1  0  0  0  0 -1  0  0  0  0]
 [ 0 -1  0 -1 -1  0  0  0  0  0]
 [ 0  0  0  0  0 -1  1 -1  0  0]
 [ 0  0  0 -1  0  0  0  0  0  0]]
[[[-99.         -99.         -99.         -99.         -99.
   -99.         -99.         -99.         -99.         -99.        ]
  [  0.15270995   0.19088744   0.23860929   0.29826162  -0.62717298
     0.46603378   0.58254222   0.72817778   0.58254222   0.46603378]
  [  0.19088744  -0.76139071  -0.70173838   0.37282702   0.46603378
     0.58254222   0.72817778   0.91022222  -0.27182222   0.58254222]
  [  0.23860929   0.29826162   0.37282702  -0.70173838   0.37282702
     0.46603378   0.42222222   1.13777778   0.91022222   0.72817778]
  [ -0.70173838   0.37282702   0.46603378   0.37282702   0.33777778
     0.42222222   1.77777778   1.42222222   1.13777778   0.91022222]
  [  0.37282702  -0.53396622   0.58254222  -0.27182222   0.42222222
     1.77777778   2.22222222   1.77777778   1.42222222   1.13777778]
  [  0.46603378   0.58254222   0.72817778   0.91022222   1.13777778
     1.22222222   2.77777778   1.22222222   1.13777778   0.91022222]]

 [[  0.19088744   0.23860929   0.29826162  -0.62717298   0.46603378
     0.58254222   0.72817778   0.58254222   0.46603378 -99.        ]
  [ -0.76139071  -0.70173838   0.37282702   0.46603378   0.58254222
     0.72817778   0.91022222  -0.27182222   0.58254222 -99.        ]
  [  0.29826162   0.37282702  -0.70173838   0.37282702   0.46603378
     0.42222222   1.13777778   0.91022222   0.72817778 -99.        ]
  [  0.37282702   0.46603378   0.37282702   0.33777778   0.42222222
     1.77777778   1.42222222   1.13777778   0.91022222 -99.        ]
  [ -0.53396622   0.58254222  -0.27182222   0.42222222   1.77777778
     2.22222222   1.77777778   1.42222222   1.13777778 -99.        ]
  [  0.58254222   0.72817778   0.91022222   1.13777778   1.22222222
     2.77777778   1.22222222   1.13777778   0.91022222 -99.        ]
  [  0.46603378   0.58254222   0.13777778   1.42222222   1.77777778
     2.22222222   1.77777778   1.42222222   1.13777778 -99.        ]]

 [[  0.19088744  -0.76139071  -0.70173838   0.37282702   0.46603378
     0.58254222   0.72817778   0.91022222  -0.27182222   0.58254222]
  [  0.23860929   0.29826162   0.37282702  -0.70173838   0.37282702
     0.46603378   0.42222222   1.13777778   0.91022222   0.72817778]
  [ -0.70173838   0.37282702   0.46603378   0.37282702   0.33777778
     0.42222222   1.77777778   1.42222222   1.13777778   0.91022222]
  [  0.37282702  -0.53396622   0.58254222  -0.27182222   0.42222222
     1.77777778   2.22222222   1.77777778   1.42222222   1.13777778]
  [  0.46603378   0.58254222   0.72817778   0.91022222   1.13777778
     1.22222222   2.77777778   1.22222222   1.13777778   0.91022222]
  [  0.37282702   0.46603378   0.58254222   0.13777778   1.42222222
     1.77777778   2.22222222   1.77777778   1.42222222   1.13777778]
  [-99.         -99.         -99.         -99.         -99.
   -99.         -99.         -99.         -99.         -99.        ]]

 [[-99.           0.15270995   0.19088744   0.23860929   0.29826162
    -0.62717298   0.46603378   0.58254222   0.72817778   0.58254222]
  [-99.           0.19088744  -0.76139071  -0.70173838   0.37282702
     0.46603378   0.58254222   0.72817778   0.91022222  -0.27182222]
  [-99.           0.23860929   0.29826162   0.37282702  -0.70173838
     0.37282702   0.46603378   0.42222222   1.13777778   0.91022222]
  [-99.          -0.70173838   0.37282702   0.46603378   0.37282702
     0.33777778   0.42222222   1.77777778   1.42222222   1.13777778]
  [-99.           0.37282702  -0.53396622   0.58254222  -0.27182222
     0.42222222   1.77777778   2.22222222   1.77777778   1.42222222]
  [-99.           0.46603378   0.58254222   0.72817778   0.91022222
     1.13777778   1.22222222   2.77777778   1.22222222   1.13777778]
  [-99.           0.37282702   0.46603378   0.58254222   0.13777778
     1.42222222   1.77777778   2.22222222   1.77777778   1.42222222]]]
========================================
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
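
The final map above is a single greedy rollout from the start. To inspect the whole learned policy at once, a small helper along the lines below (not part of the original script; it reuses np and the trained qtable object, and the arrow characters are only a display choice) prints the greedy action for every cell:

def print_policy(qtable):
    # ^ = up, > = right, v = down, < = left
    arrows = ['^', '>', 'v', '<']
    for y in range(7):
        row = ''
        for x in range(10):
            best = int(np.argmax(qtable.table[:, y, x]))
            row += arrows[best] + ' '
        print(row)

print_policy(qtable)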

 
