[MDP] Gridworld

```python

#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)             #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import matplotlib

# Select the non-interactive 'Agg' backend before importing pyplot, so the
# script also runs on machines without a GUI; figures are written to files
# instead of being shown in a pop-up window.
matplotlib.use('Agg')

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table

WORLD_SIZE = 5
A_POS = [0, 1]
A_PRIME_POS = [4, 1]
B_POS = [0, 3]
B_PRIME_POS = [2, 3]
DISCOUNT = 0.9

# left, up, right, down
ACTIONS = [np.array([0, -1]),
           np.array([-1, 0]),
           np.array([0, 1]),
           np.array([1, 0])]
ACTIONS_FIGS = ['←', '↑', '→', '↓']


ACTION_PROB = 0.25
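
# For reference (our summary of what the code below implements): under the
# equiprobable random policy pi(a|s) = ACTION_PROB, policy evaluation solves
#     v(s) = sum_a ACTION_PROB * (r(s, a) + DISCOUNT * v(s'))
# for every state, while value iteration (figure_3_5) solves the Bellman
# optimality equation
#     v*(s) = max_a (r(s, a) + DISCOUNT * v*(s')).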


def step(state, action):
    # Given the current state and an action, return the next state and the
    # immediate reward.

    # Special states: every action from A jumps to A' (+10), from B to B' (+5).
    if state == A_POS:
        return A_PRIME_POS, 10
    if state == B_POS:
        return B_PRIME_POS, 5

    # Adding two Python lists concatenates them, so convert the position to a
    # numpy array for element-wise addition, then back to a list with tolist().
    next_state = (np.array(state) + action).tolist()

    x, y = next_state
    if x < 0 or x >= WORLD_SIZE or y < 0 or y >= WORLD_SIZE:
        # Bumping into the wall: reward -1 and the agent stays where it is.
        reward = -1.0
        next_state = state
    else:
        reward = 0
    return next_state, reward
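
# A minimal sanity check of step() (our addition, not part of the original
# script); the expected tuples follow directly from the constants above.
def _check_step():
    assert step(A_POS, ACTIONS[0]) == ([4, 1], 10)     # any action at A: jump to A', +10
    assert step(B_POS, ACTIONS[3]) == ([2, 3], 5)      # any action at B: jump to B', +5
    assert step([0, 0], ACTIONS[1]) == ([0, 0], -1.0)  # wall bump: stay put, -1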


def draw_image(image):
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Add cells
    for (i, j), val in np.ndenumerate(image):

        # add state labels
        if [i, j] == A_POS:
            val = str(val) + " (A)"
        if [i, j] == A_PRIME_POS:
            val = str(val) + " (A')"
        if [i, j] == B_POS:
            val = str(val) + " (B)"
        if [i, j] == B_PRIME_POS:
            val = str(val) + " (B')"
        
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')
        

    # Row and column labels...
    for i in range(len(image)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)

    
def draw_policy(optimal_values):
    # Draw the greedy policy as arrows in the corresponding cells.
    fig, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = optimal_values.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # For each cell, find the best action(s) by one-step lookahead and build
    # the arrow label for that cell.
    for (i, j), val in np.ndenumerate(optimal_values):
        next_vals = []
        for action in ACTIONS:
            next_state, _ = step([i, j], action)
            # Collect the value of the successor state for each action.
            next_vals.append(optimal_values[next_state[0], next_state[1]])

        # Indices of all maximal entries; the argmax may not be unique, in
        # which case the cell shows several arrows.
        best_actions = np.where(next_vals == np.max(next_vals))[0]
        val = ''
        for ba in best_actions:
            val += ACTIONS_FIGS[ba]
        
        # add state labels
        if [i, j] == A_POS:
            val = str(val) + " (A)"
        if [i, j] == A_PRIME_POS:
            val = str(val) + " (A')"
        if [i, j] == B_POS:
            val = str(val) + " (B)"
        if [i, j] == B_PRIME_POS:
            val = str(val) + " (B')"
        
        tb.add_cell(i, j, width, height, text=val,
                loc='center', facecolor='white')
    
    # Row and column labels
    for i in range(len(optimal_values)):
        tb.add_cell(i, -1, width, height, text=i+1, loc='right',
                    edgecolor='none', facecolor='none')
        tb.add_cell(-1, i, width, height/2, text=i+1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)


def figure_3_2():
    # Evaluate the equiprobable random policy and plot its state-value function.
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        # Keep sweeping until the value function converges.
        # np.zeros_like(x) builds an all-zero array with the same shape as x.
        new_value = np.zeros_like(value)
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # Bellman expectation equation: expectation under the
                    # random policy, with ACTION_PROB = 0.25 for each action.
                    new_value[i, j] += ACTION_PROB * (reward + DISCOUNT * value[next_i, next_j])
        if np.sum(np.abs(value - new_value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('./images/figure_3_2.png')
            plt.close()
            break
        value = new_value
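
# An in-place (Gauss-Seidel style) variant of the sweep above: a sketch of
# ours using the same constants, not part of the original script. Each update
# immediately reuses the freshest neighbor values, which typically converges
# in fewer sweeps than the synchronous version in figure_3_2().
def _evaluate_in_place(theta=1e-4):
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        delta = 0.0
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                v = sum(ACTION_PROB * (reward + DISCOUNT * value[ni, nj])
                        for (ni, nj), reward in (step([i, j], a) for a in ACTIONS))
                delta = max(delta, abs(v - value[i, j]))
                value[i, j] = v
        if delta < theta:
            return value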

def figure_3_2_linear_system():
    '''
    Solve the linear system of equations to find the exact state values.
    For each state we fill in one row of coefficients together with its
    right-hand-side constant.
    '''
    A = -1 * np.eye(WORLD_SIZE * WORLD_SIZE)  # 25x25: the -v(s) term on the diagonal
    b = np.zeros(WORLD_SIZE * WORLD_SIZE)     # right-hand side, one entry per state
    for i in range(WORLD_SIZE):
        for j in range(WORLD_SIZE):
            s = [i, j]  # current state
            # Flatten the 2-D grid index (i, j) into a 1-D index into A and b.
            index_s = np.ravel_multi_index(s, (WORLD_SIZE, WORLD_SIZE))
            for a in ACTIONS:
                s_, r = step(s, a)
                index_s_ = np.ravel_multi_index(s_, (WORLD_SIZE, WORLD_SIZE))

                # Rearranged Bellman expectation equation:
                #   -v(s) + sum_a ACTION_PROB * DISCOUNT * v(s') = -sum_a ACTION_PROB * r
                A[index_s, index_s_] += ACTION_PROB * DISCOUNT
                b[index_s] -= ACTION_PROB * r  # r is the immediate reward for (s, a)

    # Solve A x = b exactly; x holds the state values in flattened order.
    x = np.linalg.solve(A, b)
    draw_image(np.round(x.reshape(WORLD_SIZE, WORLD_SIZE), decimals=2))
    plt.savefig('./images/figure_3_2_linear_system.png')
    plt.close()
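
# Verification sketch (our helper): the exact solution should satisfy the
# Bellman expectation equation at every state up to numerical precision.
# Usage: _verify_exact_solution(x.reshape(WORLD_SIZE, WORLD_SIZE)) right
# after the np.linalg.solve call above.
def _verify_exact_solution(v):
    for i in range(WORLD_SIZE):
        for j in range(WORLD_SIZE):
            expected = sum(ACTION_PROB * (r + DISCOUNT * v[ni, nj])
                           for (ni, nj), r in (step([i, j], a) for a in ACTIONS))
            assert abs(v[i, j] - expected) < 1e-8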

def figure_3_5():
    # Value iteration: solve the Bellman optimality equation and plot the
    # optimal value function together with its greedy policy.
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        # keep iteration until convergence
        new_value = np.zeros_like(value)
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                values = []
                for action in ACTIONS:
                    (next_i, next_j), reward = step([i, j], action)
                    # value iteration
                    values.append(reward + DISCOUNT * value[next_i, next_j])
                new_value[i, j] = np.max(values)
        if np.sum(np.abs(new_value - value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.savefig('./images/figure_3_5.png')
            plt.close()
            draw_policy(new_value)
            plt.savefig('./images/figure_3_5_policy.png')
            plt.close()
            break
        value = new_value
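
# A companion sketch (our helper, mirroring draw_policy): for each cell,
# return the indices into ACTIONS of the greedy actions under the full
# one-step lookahead r + DISCOUNT * v(s'). For this gridworld it should
# produce the same arrows as draw_policy's comparison of successor values.
def _greedy_actions(values):
    policy = {}
    for (i, j), _ in np.ndenumerate(values):
        q = np.array([r + DISCOUNT * values[ni, nj]
                      for (ni, nj), r in (step([i, j], a) for a in ACTIONS)])
        policy[(i, j)] = np.flatnonzero(q == q.max())
    return policy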


if __name__ == '__main__':
    import os
    # Create the output directory if it does not exist yet; the plotting
    # functions save their figures into ./images/.
    os.makedirs('images', exist_ok=True)
    figure_3_2_linear_system()
    figure_3_2()
    figure_3_5()
    ```
