def policy_iteration(env, policy, dicount_factor=1.0):
    """Run policy iteration: alternate evaluation and greedy improvement
    until the policy no longer changes.

    NOTE(review): the parameter is (mis)spelled ``dicount_factor`` in the
    original API; kept as-is for backward compatibility with keyword
    callers. It is the discount factor gamma.

    Args:
        env: environment exposing ``nS`` (state count), ``nA`` (action
            count) and a transition model
            ``P[s][a] -> [(prob, next_state, reward, done), ...]``.
        policy: array of shape (nS, nA) with per-state action
            probabilities; updated in place.
        dicount_factor: discount factor gamma for future rewards.

    Returns:
        tuple: ``(policy, V)`` — the stable deterministic policy and the
        value function of that policy.
    """
    while True:
        # Policy evaluation: value function of the current policy.
        V = policy_evaluation(policy, env, dicount_factor)
        # Becomes False as soon as any state's greedy action changes.
        policy_stable = True
        for s in range(env.nS):
            # Action currently favored by the policy in state s.
            old_action = np.argmax(policy[s])
            # One-step lookahead: Q(s, a) for every action a.
            action_values = np.zeros(env.nA)
            for a in range(env.nA):
                # `done` flag from the model is not needed here.
                for prob, next_state, reward, _ in env.P[s][a]:
                    action_values[a] += prob * (reward + dicount_factor * V[next_state])
            best_action = np.argmax(action_values)
            if old_action != best_action:
                policy_stable = False
            # Greedy improvement: deterministic policy — probability 1 on
            # the best action, 0 elsewhere (one-hot row of the identity).
            policy[s] = np.eye(env.nA)[best_action]
        if policy_stable:
            return policy, V
# 2021-10-17 3.5
# (blog-scrape metadata) Latest recommended article published 2024-05-27 22:06:54