悬崖游戏解释
import gym
import numpy as np
# Create the CliffWalking environment and print a short summary of it.
env = gym.make('CliffWalking-v0')

# Each entry pairs an output template with the names of the environment
# attributes that fill it. Attributes are looked up lazily, right before the
# corresponding line is printed.
for _template, _attr_names in (
    ('观察空间 = {}', ('observation_space',)),
    ('动作空间 = {}', ('action_space',)),
    ('状态数量 = {}, 动作数量 = {}', ('nS', 'nA')),
    ('地图大小 = {}', ('shape',)),
):
    print(_template.format(*(getattr(env, _name) for _name in _attr_names)))
def play_once(env, policy):
    """Run one episode in ``env`` following ``policy`` and return the total reward.

    Every step prints the current state, its grid location, the sampled
    action and the reward received.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``nA`` (number
            of actions) and ``shape`` (grid shape used to locate a state).
        policy: Indexable by state; ``policy[state]`` is the probability of
            each of the ``env.nA`` actions in that state.

    Returns:
        The sum of the rewards collected until the episode ends.
    """
    total_reward = 0

    reset_result = env.reset()
    # Gym >= 0.26 returns ``(observation, info)`` from reset(); older
    # releases return the observation alone.
    if (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    ):
        state = reset_result[0]
    else:
        state = reset_result

    while True:
        # Convert the flat state index into grid coordinates for display.
        loc = np.unravel_index(state, env.shape)
        print('状态 = {}, 位置 = {}'.format(state, loc), end=' ')

        # Sample an action from the policy's distribution for this state.
        action = np.random.choice(env.nA, p=policy[state])

        step_result = env.step(action)
        if len(step_result) == 5:
            # Gym >= 0.26: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Older Gym: (obs, reward, done, info).
            next_state, reward, done, _ = step_result

        print('动作 = {}, 奖励 = {}'.format(action, reward))
        total_reward += reward
        if done:
            break
        state = next_state

    return total_reward
# Hand-written deterministic policy, one action code per grid cell:
# action 1 by default, action 0 on the whole bottom row, action 2 on the
# whole last column.
# NOTE(review): presumably 0 = up, 1 = right, 2 = down in CliffWalking --
# confirm against the Gym documentation.
actions = np.full(env.shape, 1, dtype=int)
actions[-1, :] = 0
# Assigned after the bottom row, so the bottom-right corner ends up as 2.
actions[:, -1] = 2

# One-hot encode the flattened action grid: row s of optimal_policy is the
# action distribution used in state s.
optimal_policy = np.eye(4)[actions.ravel()]

total_reward = play_once(env, policy=optimal_policy)
print('总奖励 = {}'.format(total_reward))
相关问题
1、np.unravel_index(state, env.shape)
将一维状态编号 state 转换为它在形状为 env.shape 的网格中的(行, 列)坐标;初始位置为 (3, 0)。
2、np.random.choice(env.nA, p=policy[state])
例:np.random.choice(5)
在 [0, 5) 中(即 0、1、2、3、4)随机选择一个整数;参数 p 用于指定每个取值被选中的概率,省略时为等概率。
3、np.random.choice(5, 3)
在 [0, 5) 中随机选择三个整数(默认有放回抽样,结果可能重复)
T = (2, 4, 6, 2)
4、np.random.choice(T, 5)
从 T 的元素中有放回地随机抽取五个数
5、定义全是1的数组
actions[-1, :] 取数组最后一行
actions[:, -1] 取数组最后一列