#######################################################################
# Copyright (C) #
# 2016-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com) #
# 2016 Kenta Shimada(hyperkentakun@gmail.com) #
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table
# Select the non-interactive 'Agg' backend. Per the original author's note:
# without a GUI (or when the environment cannot forward/display plot windows)
# plotting raises an error; with this call the script runs without showing
# any figure window.
matplotlib.use('Agg')
# Side length of the square grid; cells are addressed as [row, column].
WORLD_SIZE = 5

# Special cells: step() sends every action taken in A to A' and every action
# taken in B to B'.
A_POS = [0, 1]
A_PRIME_POS = [4, 1]
B_POS = [0, 3]
B_PRIME_POS = [2, 3]

# Discount factor applied to the value of the successor state.
DISCOUNT = 0.9

# Moves as [row, column] offsets, in the order: left, up, right, down.
ACTIONS = [
    np.array(offset)
    for offset in ([0, -1], [-1, 0], [0, 1], [1, 0])
]

# Arrow glyphs for drawing a policy; entry k belongs to ACTIONS[k].
ACTIONS_FIGS = ['←', '↑', '→', '↓']

# Probability of each of the four actions under the uniform random policy.
ACTION_PROB = 0.25
def step(state, action):
    """Return the successor state and immediate reward for one move.

    Args:
        state: current cell as a ``[row, column]`` list.
        action: offset added to ``state`` (one of the entries of ACTIONS).

    Returns:
        A ``(next_state, reward)`` pair:
        * from A_POS any action leads to A_PRIME_POS with reward 10;
        * from B_POS any action leads to B_PRIME_POS with reward 5;
        * a move that would leave the grid keeps ``state`` and yields -1.0;
        * every other move yields the neighbouring cell and reward 0.
    """
    # The two special cells ignore the chosen action entirely.
    if state == A_POS:
        return A_PRIME_POS, 10
    if state == B_POS:
        return B_PRIME_POS, 5

    # Adding two lists would concatenate them, so do the arithmetic on a
    # numpy array and convert the result back to a plain list.
    candidate = (np.array(state) + action).tolist()
    row, col = candidate

    hit_wall = row < 0 or row >= WORLD_SIZE or col < 0 or col >= WORLD_SIZE
    if hit_wall:
        # Bumping into the border: stay where we are and pay a penalty.
        return state, -1.0
    return candidate, 0
def draw_image(image):
    """Draw a 2-D array of values as a table on a new matplotlib figure.

    Every element of ``image`` becomes one white cell showing its value. The
    cells at A_POS, A_PRIME_POS, B_POS and B_PRIME_POS get the suffix
    " (A)", " (A')", " (B)" or " (B')" appended to their text. 1-based row
    numbers are placed in an extra column on the left and 1-based column
    numbers in an extra, half-height row on top.

    The figure is created with ``plt.subplots()`` and is neither saved nor
    closed here; the caller is expected to do that.

    Args:
        image: 2-D numpy array. Row and column labels follow its actual
            shape, so it does not have to be square.
    """
    _, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = image.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Suffixes for the special cells, applied in this order.
    state_labels = (
        (A_POS, " (A)"),
        (A_PRIME_POS, " (A')"),
        (B_POS, " (B)"),
        (B_PRIME_POS, " (B')"),
    )

    # One cell per array element.
    for (i, j), val in np.ndenumerate(image):
        for pos, suffix in state_labels:
            if [i, j] == pos:
                val = str(val) + suffix
        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')

    # Row labels: one per row, in the extra column -1.
    for i in range(nrows):
        tb.add_cell(i, -1, width, height, text=i + 1, loc='right',
                    edgecolor='none', facecolor='none')

    # Column labels: one per column (not per row), in the extra row -1.
    for j in range(ncols):
        tb.add_cell(-1, j, width, height / 2, text=j + 1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)
def draw_policy(optimal_values):
    """Draw the greedy policy for a value table as arrows on a new figure.

    For every cell the action value ``reward + DISCOUNT * v(next_state)`` is
    computed with step() for each entry of ACTIONS. All actions whose value
    matches the maximum (within floating-point tolerance) are shown in the
    cell using the glyphs from ACTIONS_FIGS. The special cells additionally
    get the suffix " (A)", " (A')", " (B)" or " (B')". 1-based row and column
    numbers are added on the left and on top.

    The figure is created with ``plt.subplots()`` and is neither saved nor
    closed here; the caller is expected to do that.

    Args:
        optimal_values: 2-D numpy array of state values, indexed
            ``[row, column]``. step() bounds moves by WORLD_SIZE, so the
            array is expected to be WORLD_SIZE x WORLD_SIZE.
    """
    _, ax = plt.subplots()
    ax.set_axis_off()
    tb = Table(ax, bbox=[0, 0, 1, 1])

    nrows, ncols = optimal_values.shape
    width, height = 1.0 / ncols, 1.0 / nrows

    # Suffixes for the special cells, applied in this order.
    state_labels = (
        (A_POS, " (A)"),
        (A_PRIME_POS, " (A')"),
        (B_POS, " (B)"),
        (B_PRIME_POS, " (B')"),
    )

    for (i, j), _ in np.ndenumerate(optimal_values):
        # Value of each action: the immediate reward must be included,
        # otherwise a wall bump (reward -1, same cell) is judged only by
        # the value of the current cell.
        action_values = []
        for action in ACTIONS:
            (next_i, next_j), reward = step([i, j], action)
            action_values.append(
                reward + DISCOUNT * optimal_values[next_i, next_j])

        # Several actions may be equally good; keep all of them. A tolerance
        # is used because the values come from an iterative approximation.
        best_actions = np.where(
            np.isclose(action_values, np.max(action_values)))[0]
        val = ''.join(ACTIONS_FIGS[ba] for ba in best_actions)

        for pos, suffix in state_labels:
            if [i, j] == pos:
                val = str(val) + suffix

        tb.add_cell(i, j, width, height, text=val,
                    loc='center', facecolor='white')

    # Row labels: one per row, in the extra column -1.
    for i in range(nrows):
        tb.add_cell(i, -1, width, height, text=i + 1, loc='right',
                    edgecolor='none', facecolor='none')

    # Column labels: one per column (not per row), in the extra row -1.
    for j in range(ncols):
        tb.add_cell(-1, j, width, height / 2, text=j + 1, loc='center',
                    edgecolor='none', facecolor='none')

    ax.add_table(tb)
def figure_3_2():
    """Evaluate the uniform random policy iteratively and save the picture.

    Starting from an all-zero table, every sweep recomputes each cell as the
    sum over all actions of
    ``ACTION_PROB * (reward + DISCOUNT * value[next_state])`` using the
    values of the previous sweep. Sweeping stops once the summed absolute
    change between two consecutive tables is below 1e-4. The final table,
    rounded to two decimals, is drawn with draw_image() and written to
    './images/figure_3_2.png'.
    """
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    converged = False
    while not converged:
        # Build the new table from scratch so that every cell is updated
        # from the previous sweep's values only.
        new_value = np.zeros_like(value)
        for i, j in np.ndindex(WORLD_SIZE, WORLD_SIZE):
            for action in ACTIONS:
                (next_i, next_j), reward = step([i, j], action)
                # Bellman expectation backup for one action.
                backup = reward + DISCOUNT * value[next_i, next_j]
                new_value[i, j] += ACTION_PROB * backup
        converged = np.sum(np.abs(value - new_value)) < 1e-4
        value = new_value

    draw_image(np.round(value, decimals=2))
    plt.savefig('./images/figure_3_2.png')
    plt.close()
def figure_3_2_linear_system():
    """Solve for the random-policy state values exactly and save the picture.

    The Bellman equation of every state s,
        v(s) = sum_a ACTION_PROB * (r + DISCOUNT * v(s')),
    is rewritten as one row of a linear system ``A x = b``:
        -v(s) + sum_a ACTION_PROB * DISCOUNT * v(s') = -sum_a ACTION_PROB * r.
    The system is solved with ``np.linalg.solve``; the solution, reshaped to
    the grid and rounded to two decimals, is drawn with draw_image() and
    written to './images/figure_3_2_linear_system.png'.
    """
    grid_shape = (WORLD_SIZE, WORLD_SIZE)
    n_states = WORLD_SIZE * WORLD_SIZE

    # Start from -I (the "-v(s)" term) and an all-zero right-hand side.
    A = -1 * np.eye(n_states)
    b = np.zeros(n_states)

    # Flat index k corresponds to cell [k // WORLD_SIZE, k % WORLD_SIZE],
    # i.e. the row-major numbering also used by np.ravel_multi_index.
    for index_s in range(n_states):
        s = list(divmod(index_s, WORLD_SIZE))
        for a in ACTIONS:
            s_, r = step(s, a)
            index_s_ = np.ravel_multi_index(s_, grid_shape)
            # Coefficient of v(s') in the equation of s; several actions may
            # lead to the same successor, hence the accumulation.
            A[index_s, index_s_] += ACTION_PROB * DISCOUNT
            # Expected immediate reward moves to the right-hand side.
            b[index_s] -= ACTION_PROB * r

    x = np.linalg.solve(A, b)
    draw_image(np.round(x.reshape(grid_shape), decimals=2))
    plt.savefig('./images/figure_3_2_linear_system.png')
    plt.close()
def figure_3_5():
    """Run value iteration and save the optimal values and greedy policy.

    Starting from an all-zero table, every sweep sets each cell to the
    maximum over all actions of ``reward + DISCOUNT * value[next_state]``,
    using the values of the previous sweep. Sweeping stops once the summed
    absolute change between two consecutive tables is below 1e-4.

    Two pictures are written: the final values rounded to two decimals
    ('./images/figure_3_5.png', via draw_image) and the policy derived from
    the unrounded values ('./images/figure_3_5_policy.png', via draw_policy).
    """
    value = np.zeros((WORLD_SIZE, WORLD_SIZE))
    while True:
        new_value = np.zeros_like(value)
        for i in range(WORLD_SIZE):
            for j in range(WORLD_SIZE):
                outcomes = [step([i, j], action) for action in ACTIONS]
                # Value-iteration backup: best one-step lookahead.
                new_value[i, j] = np.max([
                    reward + DISCOUNT * value[next_i, next_j]
                    for (next_i, next_j), reward in outcomes
                ])
        if np.sum(np.abs(new_value - value)) < 1e-4:
            break
        value = new_value

    # new_value now holds the table from the last (converged) sweep.
    draw_image(np.round(new_value, decimals=2))
    plt.savefig('./images/figure_3_5.png')
    plt.close()

    draw_policy(new_value)
    plt.savefig('./images/figure_3_5_policy.png')
    plt.close()
if __name__ == '__main__':
    import os

    # The figure functions save into './images' (relative to the current
    # working directory). Create it if needed; exist_ok=True keeps repeated
    # runs from failing when the directory is already there.
    os.makedirs('images', exist_ok=True)

    figure_3_2_linear_system()
    figure_3_2()
    figure_3_5()
```
02-25
5388
06-30
4414