Value Iteration
Earlier we implemented policy iteration for dynamic programming; now let's implement value iteration to get a clearer understanding of it.
As before, we import the necessary packages and write a function that executes a policy.
import gym
import numpy as np
from IPython.display import clear_output
import time
import pprint
env = gym.make('FrozenLake-v0')
def execute_policy(env, policy, render=False):
    """Run one episode following the given stochastic policy; return the total reward and step count."""
    total_rewards, total_steps = 0, 0
    state = env.reset()
    while True:
        if render:
            env.render()
            clear_output(wait=True)
            time.sleep(0.2)
        # Sample an action from the policy's probability distribution for the current state.
        action = np.random.choice(env.nA, p=policy[state])
        state, reward, done, _ = env.step(action)
        total_rewards += reward
        total_steps += 1
        if done:
            if render:
                print('total_steps: ', total_steps)
                print('total_rewards: ', total_rewards)
                time.sleep(2)
                clear_output()
            break
    return total_rewards, total_steps
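Before any planning, execute_policy can be sanity-checked with a uniformly random policy (a minimal sketch; the name random_policy is just for illustration):
# Illustrative sanity check: one episode under a uniformly random policy.
# FrozenLake is slippery, so a random walk rarely reaches the goal (reward 1.0).
random_policy = np.ones((env.nS, env.nA)) / env.nA
rewards, steps = execute_policy(env, random_policy)
print(rewards, steps)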
Unlike last time, here we write a dedicated function that converts state values into action values, which keeps the later functions concise and clear.
def v_to_q(env, v_table, s, gamma=1.0):
    """Back up the state-value table into the action values q(s, a) for a single state s."""
    q_table = np.zeros(env.nA)  # shape (env.nA,); np.zeros((1, env.nA)) would give a 2-D [[0, 0, 0, 0]] instead
    for a in range(env.nA):
        # Expected return of action a in state s; (1.0 - done) zeroes out the value of terminal successor states.
        q_table[a] = sum([trans_prob * (reward + gamma * v_table[s_] * (1.0 - done))
                          for trans_prob, s_, reward, done in env.P[s][a]])
    return q_table
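v_to_q relies on the environment's dynamics dictionary env.P, where env.P[s][a] is a list of (trans_prob, next_state, reward, done) tuples; the snippet below is purely illustrative of that format:
# Inspect the transition model used by v_to_q: in the slippery FrozenLake,
# each non-terminal (state, action) pair has three possible outcomes with probability 1/3 each.
for trans_prob, s_, reward, done in env.P[0][1]:
    print(trans_prob, s_, reward, done)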
Next comes the main body of value iteration:
'''
With value iteration there is no need to write separate policy-evaluation and
policy-improvement routines: in each sweep we evaluate every state once and apply the
Bellman optimality equation to update its value, repeating until the largest change
falls below the threshold.
'''
def iterate_value(env, threshold=1e-10, gamma=1.0):
    """Value iteration: sweep all states with the Bellman optimality backup until the values converge."""
    value_table = np.zeros(env.nS)
    while True:
        delta = 0.0
        for s in range(env.nS):
            # Bellman optimality backup: v(s) <- max_a q(s, a)
            V_max = max(v_to_q(env, value_table, s, gamma))
            delta = max(delta, abs(V_max - value_table[s]))
            value_table[s] = V_max
        if delta < threshold:
            break
    # Extract a deterministic greedy policy from the converged value table.
    policy = np.zeros((env.nS, env.nA))
    for s in range(env.nS):
        a = np.argmax(v_to_q(env, value_table, s, gamma))
        policy[s][a] = 1.0
    return policy, value_table
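As a quick check (a sketch, not part of the original code), the returned value table should satisfy the Bellman optimality equation, i.e. every state's value equals its maximum action value:
# Sanity check: at the fixed point, v(s) == max_a q(s, a) for every state.
policy, v = iterate_value(env)
assert all(abs(max(v_to_q(env, v, s)) - v[s]) < 1e-8 for s in range(env.nS))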
Recall the value iteration update formula:
$$v_{k+1}(s)=\max_{a} \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma v_{k}\left(s^{\prime}\right)\right]$$
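Once the values have converged, the second loop of iterate_value extracts the greedy policy, which corresponds to

$$\pi(s)=\underset{a}{\operatorname{argmax}} \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma v\left(s^{\prime}\right)\right]$$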
The textbook's pseudocode for the algorithm and its FrozenLake example figure are not reproduced here. To evaluate how well the resulting policy performs, we run it for many episodes and record the success rate and average episode length:
def check_performance(env, policy, episodes=1000):
    """Estimate the average episode length and the success rate of a policy over many episodes."""
    success_times, total_steps = 0., 0.
    for i in range(episodes):
        one_episode_rewards, one_episode_steps = execute_policy(env, policy)
        # In FrozenLake a return of 1.0 means the goal was reached.
        if one_episode_rewards == 1.0:
            success_times += 1.
        total_steps += one_episode_steps
    return total_steps / episodes, success_times / episodes
optimal_policy, value_table = iterate_value(env)
# run demo
execute_policy(env, optimal_policy, render=True)
[out]:
(1.0, 33)
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(np.argmax(optimal_policy, axis=1).reshape(4, 4))
print()
pp.pprint(value_table.reshape(4, 4))
[out]:
array([[0, 3, 3, 3],
[0, 0, 0, 0],
[3, 1, 0, 0],
[0, 2, 1, 0]], dtype=int64)
array([[0.82352941, 0.82352941, 0.82352941, 0.82352941],
[0.82352941, 0. , 0.52941176, 0. ],
[0.82352941, 0.82352941, 0.76470588, 0. ],
[0. , 0.88235294, 0.94117647, 0. ]])
ave_steps, acc = check_performance(env, optimal_policy)
print("ave_steps: ", ave_steps)
print("acc: ", acc)
[out]:
ave_steps: 27.906
acc: 0.74
References
《强化学习原理与Python实现》肖智清