[MDP] Gridworld

```python

#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)             #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import matplotlib

# Select the non-interactive 'Agg' backend before importing pyplot, so the
# script also runs on machines without a GUI; figures are written to files
# instead of being shown in a pop-up window.
matplotlib.use('Agg')

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

WORLD_SIZE = 5
A_POS = [0, 1]
A_PRIME_POS = [4, 1]
B_POS = [0, 3]
B_PRIME_POS = [2, 3]
DISCOUNT = 0.9

# left, up, right, down
ACTIONS = [np.array([0, -1]),
           np.array([-1, 0]),
           np.array([0, 1]),
           np.array([1, 0])]
ACTIONS_FIGS = ['←', '↑', '→', '↓']


ACTION_PROB = 0.25
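
# For reference (our summary of what the code below implements): under the
# equiprobable random policy pi(a|s) = ACTION_PROB, policy evaluation solves
#     v(s) = sum_a ACTION_PROB * (r(s, a) + DISCOUNT * v(s'))
# for every state, while value iteration (figure_3_5) solves the Bellman
# optimality equation
#     v*(s) = max_a (r(s, a) + DISCOUNT * v*(s')).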


def step(state, action):
    # Given the current state and an action, return the next state and the
    # immediate reward.

    # Special states: every action from A jumps to A' (+10), from B to B' (+5).
    if state == A_POS:
        return A_PRIME_POS, 10
    if state == B_POS:
        return B_PRIME_POS, 5

    # Adding two Python lists concatenates them, so convert the position to a
    # numpy array for element-wise addition, then back to a list with tolist().
    next_state = (np.array(state) + action).tolist()

    x, y = next_state
    if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
        # Bumping into the wall: reward -1 and the agent stays where it is.
        reward = -1.0
        next_state = state
    else:
        reward = 0
    return next_state, reward
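
# A minimal sanity check of step() (our addition, not part of the original
# script); the expected tuples follow directly from the constants above.
def _check_step():
    assert step(A_POS, ACTIONS[0]) == ([4, 1], 10)     # any action at A: jump to A', +10
    assert step(B_POS, ACTIONS[3]) == ([2, 3], 5)      # any action at B: jump to B', +5
    assert step([0, 0], ACTIONS[1]) == ([0, 0], -1.0)  # wall bump: stay put, -1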


def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):

        # add state labels
        if [i, j] == A_POS:
            val = str(val) + " (A)"
        if [i, j] == A_PRIME_POS:
            val = str(val) + " (A')"
        if [i, j] == B_POS:
            val = str(val) + " (B)"
        if [i, j] == B_PRIME_POS:
            val = str(val) + " (B')"
        
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')
        

    # Row and column labels...
    for i in range(len(image)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)

    
def draw_policy(optimal_values):
    # Draw the greedy policy as arrows in the corresponding cells.
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = optimal_values.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # For each cell, find the best action(s) by one-step lookahead and build
    # the arrow label for that cell.
    for (i, j), val in np.ndenumerate(optimal_values):
        next_vals = []
        for action in ACTIONS:
            next_state, _ = step([i, j], action)
            # Collect the value of the successor state for each action.
            next_vals.append(optimal_values[next_state[0], next_state[1]])

        # Indices of all maximal entries; the argmax may not be unique, in
        # which case the cell shows several arrows.
        best_actions = np.where(next_vals == np.max(next_vals))[0]
        val = ''
        for ba in best_actions:
            val += ACTIONS_FIGS[ba]
        
        # add state labels
        if [i, j] == A_POS:
            val = str(val) + " (A)"
        if [i, j] == A_PRIME_POS:
            val = str(val) + " (A')"
        if [i, j] == B_POS:
            val = str(val) + " (B)"
        if [i, j] == B_PRIME_POS:
            val = str(val) + " (B')"
        
        tb.add_cell(i, j, width, height, text=val,
                loc='center', facecolor='white')
    
    # Row and column labels
    for i in range(len(optimal_values)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)


def figure_3_2():
    # Evaluate the equiprobable random policy and plot its state-value function.
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        # Keep sweeping until the value function converges.
        # np.zeros_like(x) builds an all-zero array with the same shape as x.
        new_value = np.zeros_like(value)
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # Bellman expectation equation: expectation under the
                    # random policy, with ACTION_PROB = 0.25 for each action.
                    new_value[i, j] += ACTION_PROB * (reward + DISCOUNT * value[next_i, next_j])
        if np.sum(np.abs(value - new_value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('./images/figure_3_2.png')
            plt.close()
            break
        value = new_value
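
# An in-place (Gauss-Seidel style) variant of the sweep above: a sketch of
# ours using the same constants, not part of the original script. Each update
# immediately reuses the freshest neighbor values, which typically converges
# in fewer sweeps than the synchronous version in figure_3_2().
def _evaluate_in_place(theta=1e-4):
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        delta = 0.0
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                v = sum(ACTION_PROB * (reward + DISCOUNT * value[ni, nj])
                        for (ni, nj), reward in (step([i, j], a) for a in ACTIONS))
                delta = max(delta, abs(v - value[i, j]))
                value[i, j] = v
        if delta < theta:
            return value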

def figure_3_2_linear_system():
    '''
    Solve the linear system of equations to find the exact state values.
    For each state we fill in one row of coefficients together with its
    right-hand-side constant.
    '''
    A = -1 * np.eye(WORLD_SIZE * WORLD_SIZE)  # 25x25: the -v(s) term on the diagonal
    b = np.zeros(WORLD_SIZE * WORLD_SIZE)     # right-hand side, one entry per state
    for i in range(WORLD_SIZE):
        for j in range(WORLD_SIZE):
            s = [i, j]  # current state
            # Flatten the 2-D grid index (i, j) into a 1-D index into A and b.
            index_s = np.ravel_multi_index(s, (WORLD_SIZE, WORLD_SIZE))
            for a in ACTIONS:
                s_, r = step(s, a)
                index_s_ = np.ravel_multi_index(s_, (WORLD_SIZE, WORLD_SIZE))

                # Rearranged Bellman expectation equation:
                #   -v(s) + sum_a ACTION_PROB * DISCOUNT * v(s') = -sum_a ACTION_PROB * r
                A[index_s, index_s_] += ACTION_PROB * DISCOUNT
                b[index_s] -= ACTION_PROB * r  # r is the immediate reward for (s, a)

    # Solve A x = b exactly; x holds the state values in flattened order.
    x = np.linalg.solve(A, b)
    draw_image(np.round(x.reshape(WORLD_SIZE, WORLD_SIZE), decimals=2))
    plt.savefig('./images/figure_3_2_linear_system.png')
    plt.close()
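
# Verification sketch (our helper): the exact solution should satisfy the
# Bellman expectation equation at every state up to numerical precision.
# Usage: _verify_exact_solution(x.reshape(WORLD_SIZE, WORLD_SIZE)) right
# after the np.linalg.solve call above.
def _verify_exact_solution(v):
    for i in range(WORLD_SIZE):
        for j in range(WORLD_SIZE):
            expected = sum(ACTION_PROB * (r + DISCOUNT * v[ni, nj])
                           for (ni, nj), r in (step([i, j], a) for a in ACTIONS))
            assert abs(v[i, j] - expected) < 1e-8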

def figure_3_5():
    # Value iteration: solve the Bellman optimality equation and plot the
    # optimal value function together with its greedy policy.
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        # keep iteration until convergence
        new_value = np.zeros_like(value)
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                values = []
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # value iteration
                    values.append(reward + DISCOUNT * value[next_i, next_j])
                new_value[i, j] = np.max(values)
        if np.sum(np.abs(new_value - value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('./images/figure_3_5.png')
            plt.close()
            draw_policy(new_value)
            plt.savefig('./images/figure_3_5_policy.png')
            plt.close()
            break
        value = new_value
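
# A companion sketch (our helper, mirroring draw_policy): for each cell,
# return the indices into ACTIONS of the greedy actions under the full
# one-step lookahead r + DISCOUNT * v(s'). For this gridworld it should
# produce the same arrows as draw_policy's comparison of successor values.
def _greedy_actions(values):
    policy = {}
    for (i, j), _ in np.ndenumerate(values):
        q = np.array([r + DISCOUNT * values[ni, nj]
                      for (ni, nj), r in (step([i, j], a) for a in ACTIONS)])
        policy[(i, j)] = np.flatnonzero(q == q.max())
    return policy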


if __name__ == '__main__':
    import os
    # Create the output directory if it does not exist yet; the plotting
    # functions save their figures into ./images/.
    os.makedirs('images', exist_ok=True)
    figure_3_2_linear_system()
    figure_3_2()
    figure_3_5()
    ```
