Theory Review
[1]. Solving the Bellman Equation
[2]. 3.12 Value Iteration - Frozen Lake Problem.ipynb
[3]. Markov Decision Processes and the Bellman Equation in Reinforcement Learning
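As a quick refresher, value iteration repeatedly applies the Bellman optimality backup until the value table stops changing, and a greedy policy is then read off from the converged values; this is exactly what the two functions below implement:

$$V_{k+1}(s) = \max_{a} \sum_{s'} P(s' \mid s, a)\,\big[R(s, a, s') + \gamma V_k(s')\big]$$

$$\pi^*(s) = \arg\max_{a} \sum_{s'} P(s' \mid s, a)\,\big[R(s, a, s') + \gamma V^*(s')\big]$$

Here $P(s' \mid s, a)$ and $R(s, a, s')$ are the transition probabilities and rewards, which gym exposes for this environment through env.P as (probability, next state, reward, done) tuples.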
''' Value iteration for the Frozen Lake problem '''
"""
The frozen lake, where S is the start position, F is frozen surface that can be
walked on, H is a hole to watch out for, and G is the goal:
S F F F
F H F H
F F F H
H F F G
The goal is to find the best path from S to G without falling into an H.
"""
import gym
import numpy as np
env = gym.make('FrozenLake-v0')  # named 'FrozenLake-v1' in newer gym versions
env.render()  # inspect the environment
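# Quick sanity check (a small sketch): the 4x4 map above corresponds to
# 16 discrete states and 4 actions (0=Left, 1=Down, 2=Right, 3=Up)
print(env.observation_space.n)  # 16
print(env.action_space.n)       # 4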
def value_iteration(env, gamma=1.0):
    # initialize the value table with zeros
    value_table = np.zeros(env.observation_space.n)
    # set the number of sweeps and the convergence threshold
    no_of_iterations = 100000
    threshold = 1e-20
    for i in range(no_of_iterations):
        # keep a copy of the value table from the previous sweep
        # so we can measure how much this sweep changes it
        prev_value_table = np.copy(value_table)
        # compute the Q value of every action in each state and update
        # the state's value with the maximum Q value
        for state in range(env.observation_space.n):
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for next_sr in env.P[state][action]:
                    trans_prob, next_state, reward, _ = next_sr
                    next_states_rewards.append(trans_prob * (reward + gamma * prev_value_table[next_state]))
                Q_value.append(np.sum(next_states_rewards))
            value_table[state] = max(Q_value)
        # convergence check: if the total change between the previous sweep's
        # value table and the new one is below the threshold, stop and return
        # the value function as the optimal value function
        if np.sum(np.fabs(prev_value_table - value_table)) <= threshold:
            print('Value-iteration converged at iteration# %d.' % (i + 1))
            break
    return value_table

def extract_policy(env, value_table, gamma=1.0):
    # initialize the policy with zeros
    policy = np.zeros(env.observation_space.n, dtype=int)
    for state in range(env.observation_space.n):
        # initialize the Q table for this state
        Q_table = np.zeros(env.action_space.n)
        # compute the Q value of every action in the state
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward, _ = next_sr
                Q_table[action] += trans_prob * (reward + gamma * value_table[next_state])
        # select the action with the maximum Q value as the optimal action for the state
        policy[state] = np.argmax(Q_table)
    return policy

optimal_value_function = value_iteration(env=env, gamma=1.0)
optimal_policy = extract_policy(env, optimal_value_function, gamma=1.0)
print(optimal_policy)
The optimal policy is: array([0, 3, 3, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0]), where 0=Left, 1=Down, 2=Right, 3=Up.
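To sanity-check the result, we can roll the policy out in the environment and measure how often it actually reaches G. Below is a minimal sketch, assuming the classic gym API (env.reset() returns a state and env.step() returns a (state, reward, done, info) tuple); evaluate_policy is a helper name introduced here for illustration:

def evaluate_policy(env, policy, episodes=1000):
    # run the policy for a number of episodes and return the success rate;
    # FrozenLake gives a reward of 1 only when the agent reaches G
    successes = 0.0
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            state, reward, done, _ = env.step(int(policy[state]))
        successes += reward
    return successes / episodes

print(optimal_policy.reshape(4, 4))  # view the policy on the 4x4 grid
print('Success rate: %.2f' % evaluate_policy(env, optimal_policy))

Because FrozenLake-v0 is slippery (actions succeed only with some probability), even the optimal policy will not reach G on every episode.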