A First Look at Reinforcement Learning: A Q-table Example with Runnable Python Code

As a non-professional hobbyist, I decided after working through a few reinforcement learning tutorials to start with Q-tables. I consulted a lot of material; personally I found this link https://mp.weixin.qq.com/s/34E1tEQMZuaxvZA66_HRwA quite good. I had touched on the basic theory of Q-tables before but had never implemented one, and only once I started writing code did I realize how many details I had not actually thought through. Below is a Q-table implementation I wrote recently. I will not repeat the detailed theory here; anything unclear in the code is explained in the comments.
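
For reference, the rule implemented in update_Q_table below is the standard Q-learning update, written here with the same names used in the code (lr is the learning rate, discount is the discount factor):

Q[a, y, x] += lr * ( Reward[next_y, next_x] + discount * max_a' Q[a', next_y, next_x] - Q[a, y, x] )

where (next_y, next_x) is the cell reached by taking action a from (y, x), and max_a' is the maximum over the four actions available at that next cell.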

import numpy as np
from time import sleep

# Note: the Q-table is effectively filled in backwards. Cells next to the goal get meaningful
# Q-values first, and the update rule then propagates those values, sweep by sweep,
# back toward the starting cell.

class Q_table():
    def __init__(self):
        self.table = np.zeros([4, 7, 10])  # [action, row, col]; action 0: up, 1: right, 2: down, 3: left
        self.table[0, 0, :] = -99  # actions that would leave the grid get a very low Q-value
        self.table[1, :, 9] = -99  # so the greedy policy never picks them
        self.table[2, 6, :] = -99
        self.table[3, :, 0] = -99
        self.offset = 0  # only used to test rewards shifted into a different range; not required
        self.Reward = np.array([  # the design of the reward values matters a lot!
            [0, 0, 0, 0, -1, 0, 0, 0, 0, 0],
            [0, -1, -1, 0, 0, 0, 0, 0, -1, 0],
            [0, 0, 0, -1, 0, 0, -1, 0, 0, 0],
            [-1, 0, 0, 0, 0, -1, 0, 0, 0, 0],
            [0, -1, 0, -1, -1, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, -1, 1, -1, 0, 0],  # goal: walk from the top-left corner to the 1 in this row; the -1 cells should be avoided
            [0, 0, 0, -1, 0, 0, 0, 0, 0, 0],
        ]) - self.offset
        print(self.Reward)
        self.cur_y, self.cur_x = 0, 0  # current coordinates
        self.lr = 0.8  # learning rate
        self.discount = 0.8  # discount factor

    def update_Q_table(self, pos_y, pos_x, action):  # update the Q-table given the current position and the action to take
        self.cur_y, self.cur_x = pos_y, pos_x
        next_y, next_x = self.cur_y, self.cur_x  # initialize next_y and next_x
        update_flag = False
        # resolve the chosen action
        if action == 0:
            if self.cur_y > 0:  # keep next_y inside the grid, otherwise the action is invalid
                next_y, next_x = self.cur_y - 1, self.cur_x
                update_flag = True
        elif action == 1:
            if self.cur_x < 9:  # keep next_x inside the grid, otherwise the action is invalid
                next_y, next_x = self.cur_y, self.cur_x + 1
                update_flag = True
        elif action == 2:
            if self.cur_y < 6:
                next_y, next_x = self.cur_y + 1, self.cur_x
                update_flag = True
        elif action == 3:
            if self.cur_x > 0:
                next_y, next_x = self.cur_y, self.cur_x - 1
                update_flag = True
        # only update the table if the action was valid
        if update_flag:
            # Suppose we start at (x0, y0), i.e. (cur_x, cur_y), and the action moves us to
            # (x1, y1), i.e. (next_x, next_y). Collect the Q-values of the 4 actions
            # available at (x1, y1).
            # (An earlier draft also added boundary checks for each of those 4 actions here,
            # but they are unnecessary: update_flag already guarantees (x1, y1) is inside the grid.)
            next_pos_all_Q_actions_list = []
            for i in range(4):
                next_pos_all_Q_actions_list.append(self.table[i, next_y, next_x])  # Q-values of the 4 actions at (x1, y1)
            next_pos_all_Q_actions = np.array(next_pos_all_Q_actions_list)
            max_next_pos_Q_val = np.max(next_pos_all_Q_actions)  # the best of those 4 Q-values
            # Standard Q-learning update:
            # Q(s, a) += lr * (R(s') + discount * max_a' Q(s', a') - Q(s, a))
            delta_Q = self.Reward[next_y, next_x] + self.discount * max_next_pos_Q_val \
                      - self.table[action, self.cur_y, self.cur_x]
            self.table[action, self.cur_y, self.cur_x] += self.lr * delta_Q  # update Q

    def show_actions(self):  # demonstrate the learned policy by greedily following the Q-table from the start
        pos_y, pos_x = 0, 0
        whole_map = np.zeros([7, 10])
        whole_map[pos_y, pos_x] = 1
        for i in range(20):
            # at each step take the action with the highest Q-value for the current cell
            next_action = np.argmax([self.table[0, pos_y, pos_x],
                                     self.table[1, pos_y, pos_x],
                                     self.table[2, pos_y, pos_x],
                                     self.table[3, pos_y, pos_x]])
            if next_action == 0:
                pos_y -= 1
            elif next_action == 1:
                pos_x += 1
            elif next_action == 2:
                pos_y += 1
            elif next_action == 3:
                pos_x -= 1
            # whole_map = np.zeros([7, 10])  # uncomment to show only the current position instead of the whole trail
            whole_map[pos_y, pos_x] = 1
            print('=' * 40)
            print(whole_map)
            sleep(0.5)
            if self.Reward[pos_y, pos_x] == 1 - self.offset:  # stop once the goal cell is reached
                break


qtable = Q_table()
n = 0
while True:
    # one training pass: apply the update from every cell with every action
    for x in range(10):
        for y in range(7):
            for a in range(4):
                qtable.update_Q_table(y, x, a)
    n += 1
    if n == 200:  # train for 200 passes
        break
print(qtable.table)
qtable.show_actions()

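The training loop above simply sweeps every (cell, action) pair 200 times, which is easy to reason about on a grid this small because every transition keeps getting refreshed. A more common Q-learning setup instead samples episodes from the start cell and explores with an epsilon-greedy policy. The following is only a rough sketch of that variant, reusing the Q_table class above; the epsilon value, episode count, and step cap are arbitrary, untuned choices.

import random
import numpy as np

def train_epsilon_greedy(qtable, episodes=500, epsilon=0.2, max_steps=100):
    # Each episode starts at the top-left corner. With probability epsilon a random
    # action is tried, otherwise the action with the highest current Q-value.
    for _ in range(episodes):
        y, x = 0, 0
        for _ in range(max_steps):
            if random.random() < epsilon:
                a = random.randint(0, 3)
            else:
                a = int(np.argmax(qtable.table[:, y, x]))
            qtable.update_Q_table(y, x, a)
            # move only if the action stays inside the 7 x 10 grid
            if a == 0 and y > 0:
                y -= 1
            elif a == 1 and x < 9:
                x += 1
            elif a == 2 and y < 6:
                y += 1
            elif a == 3 and x > 0:
                x -= 1
            if qtable.Reward[y, x] == 1 - qtable.offset:  # end the episode at the goal cell
                break

Because out-of-bounds actions were initialized to -99, the greedy branch essentially never picks them, but the bounds checks keep the random exploration branch from stepping off the grid.
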
The sweep-based script above needs nothing beyond numpy to run. Here is its output:

[[ 0  0  0  0 -1  0  0  0  0  0]
 [ 0 -1 -1  0  0  0  0  0 -1  0]
 [ 0  0  0 -1  0  0 -1  0  0  0]
 [-1  0  0  0  0 -1  0  0  0  0]
 [ 0 -1  0 -1 -1  0  0  0  0  0]
 [ 0  0  0  0  0 -1  1 -1  0  0]
 [ 0  0  0 -1  0  0  0  0  0  0]]
[[[-99.         -99.         -99.         -99.         -99.
   -99.         -99.         -99.         -99.         -99.        ]
  [  0.15270995   0.19088744   0.23860929   0.29826162  -0.62717298
     0.46603378   0.58254222   0.72817778   0.58254222   0.46603378]
  [  0.19088744  -0.76139071  -0.70173838   0.37282702   0.46603378
     0.58254222   0.72817778   0.91022222  -0.27182222   0.58254222]
  [  0.23860929   0.29826162   0.37282702  -0.70173838   0.37282702
     0.46603378   0.42222222   1.13777778   0.91022222   0.72817778]
  [ -0.70173838   0.37282702   0.46603378   0.37282702   0.33777778
     0.42222222   1.77777778   1.42222222   1.13777778   0.91022222]
  [  0.37282702  -0.53396622   0.58254222  -0.27182222   0.42222222
     1.77777778   2.22222222   1.77777778   1.42222222   1.13777778]
  [  0.46603378   0.58254222   0.72817778   0.91022222   1.13777778
     1.22222222   2.77777778   1.22222222   1.13777778   0.91022222]]

 [[  0.19088744   0.23860929   0.29826162  -0.62717298   0.46603378
     0.58254222   0.72817778   0.58254222   0.46603378 -99.        ]
  [ -0.76139071  -0.70173838   0.37282702   0.46603378   0.58254222
     0.72817778   0.91022222  -0.27182222   0.58254222 -99.        ]
  [  0.29826162   0.37282702  -0.70173838   0.37282702   0.46603378
     0.42222222   1.13777778   0.91022222   0.72817778 -99.        ]
  [  0.37282702   0.46603378   0.37282702   0.33777778   0.42222222
     1.77777778   1.42222222   1.13777778   0.91022222 -99.        ]
  [ -0.53396622   0.58254222  -0.27182222   0.42222222   1.77777778
     2.22222222   1.77777778   1.42222222   1.13777778 -99.        ]
  [  0.58254222   0.72817778   0.91022222   1.13777778   1.22222222
     2.77777778   1.22222222   1.13777778   0.91022222 -99.        ]
  [  0.46603378   0.58254222   0.13777778   1.42222222   1.77777778
     2.22222222   1.77777778   1.42222222   1.13777778 -99.        ]]

 [[  0.19088744  -0.76139071  -0.70173838   0.37282702   0.46603378
     0.58254222   0.72817778   0.91022222  -0.27182222   0.58254222]
  [  0.23860929   0.29826162   0.37282702  -0.70173838   0.37282702
     0.46603378   0.42222222   1.13777778   0.91022222   0.72817778]
  [ -0.70173838   0.37282702   0.46603378   0.37282702   0.33777778
     0.42222222   1.77777778   1.42222222   1.13777778   0.91022222]
  [  0.37282702  -0.53396622   0.58254222  -0.27182222   0.42222222
     1.77777778   2.22222222   1.77777778   1.42222222   1.13777778]
  [  0.46603378   0.58254222   0.72817778   0.91022222   1.13777778
     1.22222222   2.77777778   1.22222222   1.13777778   0.91022222]
  [  0.37282702   0.46603378   0.58254222   0.13777778   1.42222222
     1.77777778   2.22222222   1.77777778   1.42222222   1.13777778]
  [-99.         -99.         -99.         -99.         -99.
   -99.         -99.         -99.         -99.         -99.        ]]

 [[-99.           0.15270995   0.19088744   0.23860929   0.29826162
    -0.62717298   0.46603378   0.58254222   0.72817778   0.58254222]
  [-99.           0.19088744  -0.76139071  -0.70173838   0.37282702
     0.46603378   0.58254222   0.72817778   0.91022222  -0.27182222]
  [-99.           0.23860929   0.29826162   0.37282702  -0.70173838
     0.37282702   0.46603378   0.42222222   1.13777778   0.91022222]
  [-99.          -0.70173838   0.37282702   0.46603378   0.37282702
     0.33777778   0.42222222   1.77777778   1.42222222   1.13777778]
  [-99.           0.37282702  -0.53396622   0.58254222  -0.27182222
     0.42222222   1.77777778   2.22222222   1.77777778   1.42222222]
  [-99.           0.46603378   0.58254222   0.72817778   0.91022222
     1.13777778   1.22222222   2.77777778   1.22222222   1.13777778]
  [-99.           0.37282702   0.46603378   0.58254222   0.13777778
     1.42222222   1.77777778   2.22222222   1.77777778   1.42222222]]]
========================================
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
========================================
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
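
The final map above is a single greedy rollout from the start. To inspect the whole learned policy at once, a small helper along the lines below (not part of the original script; it reuses np and the trained qtable object, and the arrow characters are only a display choice) prints the greedy action for every cell:

def print_policy(qtable):
    # ^ = up, > = right, v = down, < = left
    arrows = ['^', '>', 'v', '<']
    for y in range(7):
        row = ''
        for x in range(10):
            best = int(np.argmax(qtable.table[:, y, x]))
            row += arrows[best] + ' '
        print(row)

print_policy(qtable)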

 
