PolicyIterationSolution.py

# coding: utf-8
# Comments revised by geling, 2018-04-21
# liuyubiao changed the policy output so that every optimal action per state is reported
import numpy as np
import pprint
import sys
import PolicyEvaluationSolution

if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv

# Report every optimal action for each state (multi-policy output)

# Two global counters that record how many evaluation / improvement rounds have run
v_num = 1
i_num = 1

# Given the action values of a state, find the indices whose value equals the maximum; returns the list of greedy indices and an indicator array over the actions
def get_max_index(action_values):
    indexs = []
    policy_arr = np.zeros(len(action_values))

    action_max_value = np.max(action_values)

    for i in range(len(action_values)):
        action_value = action_values[i]

        if action_value == action_max_value:
            indexs.append(i)
            policy_arr[i] = 1
    return indexs,policy_arr

# Convert each row of the policy into a tuple of greedy action indices, so that several optimal directions can be shown at once
def change_policy(policys):
    action_tuple = []

    for policy in policys:
        indexs, policy_arr = get_max_index(policy)
        action_tuple.append(tuple(indexs))

    return action_tuple
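
# A quick illustration with hypothetical numbers (not produced by this script): for
# action_values = [-2.0, -1.0, -1.0, -3.0], get_max_index returns the index list [1, 2]
# together with the indicator array [0., 1., 1., 0.], and change_policy records that
# state as the tuple (1, 2), i.e. both RIGHT and DOWN are greedy.
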
# policy_improvement implements policy iteration. Inputs: the environment env, a policy evaluation function policy_eval_fn, and the discount factor. Outputs: the optimal policy and the optimal value function.
# policy_eval_fn defaults to PolicyEvaluationSolution.policy_eval, i.e. the policy_eval function defined in PolicyEvaluationSolution.py.
def policy_improvement(env, policy_eval_fn=PolicyEvaluationSolution.policy_eval, discount_factor=1.0):
    """
    Policy Improvement Algorithm. Iteratively evaluates and improves a policy
    until an optimal policy is found.
    
    Args:
        env: The OpenAI Gym environment.
        policy_eval_fn: Policy Evaluation function that takes 3 arguments:
            policy, env, discount_factor.
        discount_factor: gamma discount factor.
        
    Returns:
        A tuple (policy, V). 
        policy is the optimal policy, a matrix of shape [S, A] where each state s
        contains a valid probability distribution over actions.
        V is the value function for the optimal policy.
        
    """
    # Start from the uniform random policy
    policy = np.ones([env.nS, env.nA]) / env.nA

    print("初始的随机策略")
    print(policy)
    print("*"*50)

    while True:
        global i_num
        global v_num

        v_num = 1
        # Evaluate the current policy; returns the state-value function of every state
        V = policy_eval_fn(policy, env, discount_factor)

        print("第%d次策略提升时求出的各状态值函数"%i_num)
        print(V)

        print("")
        
        # Flag that records whether the policy changed during this sweep
        policy_stable = True
        
        # Sweep over all states
        for s in range(env.nS):
            # Index of the action the current policy would take in this state
            chosen_a = np.argmax(policy[s])
            
            # Initialize the action-value array [0, 0, 0, 0]
            action_values = np.zeros(env.nA)
            for a in range(env.nA):
                # Loop over the transitions of action a
                for prob, next_state, reward, done in env.P[s][a]:
                    # One-step lookahead: q(s, a) = sum over s' of p(s'|s,a) * (r + gamma * V(s'))
                    action_values[a] += prob * (reward + discount_factor * V[next_state])

            # v1.0 change: np.argmax(action_values) returns only the first index where the maximum occurs and therefore drops other equally good directions; now every greedy direction of the state is kept
            best_a_arr, policy_arr = get_max_index(action_values)
            
            # If the previously chosen action is still among the greedy actions, the policy is unchanged for this state;
            # otherwise mark the policy as not yet stable.
            # Either way, spread the probability equally over all greedy directions: normalising policy_arr keeps each
            # policy row a valid probability distribution even when several actions tie, which the next call to
            # policy_eval_fn relies on.
            if chosen_a not in best_a_arr:
                policy_stable = False
            policy[s] = policy_arr / np.sum(policy_arr)

        print("第%d次策略提升结果"%i_num)
        print(policy)
        print("*"*50)

        i_num = i_num + 1
        
        # If the policy no longer changed, it is optimal; return it together with its value function
        if policy_stable:
            print("第%d次之后得到的结果已经收敛,运算结束"%(i_num-1))

            return policy, V


env = GridworldEnv()
policy, v = policy_improvement(env)
print("策略可能的方向值:")
print(policy)
print("")

print("策略网格形式 (0=up, 1=right, 2=down, 3=left):")
# v1.0版本修改:现在输出同一个状态下会有多个最优行为,而argmax只会选取第一个进行,所以需要修改
# print(np.reshape(np.argmax(policy, axis=1), env.shape))
update_policy_type = change_policy(policy)
print(np.reshape(update_policy_type, env.shape))
print("")

print("值函数:")
print(v)
print("")

print("值函数的网格形式:")
print(v.reshape(env.shape))
print("")

# Check that the final value function matches the expected optimal values
expected_v = np.array([-4, -3, -2, -1, -2, -3, -2, -1,  0, -1, -4, -3, -2, -1, -2, -5, -4, -3, -2, -3, -6, -5, -4, -3, -4])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)
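
As an optional, purely illustrative helper (not part of the original script; the name policy_to_arrows is made up here), the tuples produced by change_policy can be rendered as arrow characters, which makes tied directions easier to read than the raw tuple grid:

import numpy as np

# Hypothetical helper: render each state's tuple of greedy actions as arrows,
# e.g. (1, 2) becomes "→↓". ARROWS maps the action indices used above.
ARROWS = {0: "↑", 1: "→", 2: "↓", 3: "←"}

def policy_to_arrows(action_tuples, shape):
    cells = ["".join(ARROWS[a] for a in actions) for actions in action_tuples]
    return np.array(cells, dtype=object).reshape(shape)

# Usage, assuming update_policy_type and env from the script above:
# print(policy_to_arrows(update_policy_type, env.shape))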

gridworld.py

import numpy as np
import sys
from gym.envs.toy_text import discrete

UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3
DONE_LOCATION = 8

class GridworldEnv(discrete.DiscreteEnv):
    """
    Grid World environment adapted from Sutton's Reinforcement Learning book, chapter 4.
    You are an agent on an MxN grid and your goal is to reach the terminal
    state T (state index DONE_LOCATION).

    For example, a 5x5 grid looks as follows:

    o  o  o  o  o
    o  o  o  T  o   # DONE_LOCATION == 8
    o  o  o  o  o
    o  x  o  o  o
    o  o  o  o  o

    x is your position and T is the terminal state.

    You can take actions in each direction (UP=0, RIGHT=1, DOWN=2, LEFT=3).
    Actions going off the edge leave you in your current state.
    You receive a reward of -1 at each step until you reach a terminal state.
    """
    def __init__(self, shape=[5,5]):
        if not isinstance(shape, (list, tuple)) or not len(shape) == 2:
            raise ValueError('shape argument must be a list/tuple of length 2')

        self.shape = shape

        nS = np.prod(shape)
        nA = 4

        MAX_Y = shape[0]
        MAX_X = shape[1]

        P = {}
        grid = np.arange(nS).reshape(shape)
        it = np.nditer(grid, flags=['multi_index'])

        while not it.finished:
            s = it.iterindex
            y, x = it.multi_index

            P[s] = {a: [] for a in range(nA)}

            is_done = lambda s: s == DONE_LOCATION
            reward = 0.0 if is_done(s) else -1.0

            # We're stuck in a terminal state
            if is_done(s):
                P[s][UP] = [(1, s, reward, True)]
                P[s][RIGHT] = [(1, s, reward, True)]
                P[s][DOWN] = [(1, s, reward, True)]
                P[s][LEFT] = [(1, s, reward, True)]
            # Not a terminal state
            else:
                ns_up = s if y == 0 else s - MAX_X
                ns_right = s if x == (MAX_X - 1) else s + 1
                ns_down = s if y == (MAX_Y - 1) else s + MAX_X
                ns_left = s if x == 0 else s - 1
                P[s][UP] = [(1, ns_up, reward, is_done(ns_up))]
                P[s][RIGHT] = [(1, ns_right, reward, is_done(ns_right))]
                P[s][DOWN] = [(1, ns_down, reward, is_done(ns_down))]
                P[s][LEFT] = [(1, ns_left, reward, is_done(ns_left))]

            it.iternext()

        # Initial state distribution is uniform
        isd = np.ones(nS) / nS

        # We expose the model of the environment for educational purposes
        # This should not be used in any model-free learning algorithm
        self.P = P

        super(GridworldEnv, self).__init__(nS, nA, P, isd)
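
Before moving on, it can help to poke at the transition model this class exposes. Below is a minimal sketch (assuming the same lib.envs.gridworld import path used by the scripts above); each env.P[s][a] holds exactly one (prob, next_state, reward, done) tuple because the grid is deterministic:

import sys
if "../" not in sys.path:
    sys.path.append("../")
from lib.envs.gridworld import GridworldEnv

env = GridworldEnv()
print(env.P[0][1])   # RIGHT from state 0:        [(1, 1, -1.0, False)]
print(env.P[0][0])   # UP from state 0 stays put: [(1, 0, -1.0, False)]
print(env.P[8][2])   # terminal state 8:          [(1, 8, 0.0, True)]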

PolicyEvaluationSolution.py

# coding: utf-8

import numpy as np
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv

# policy_eval performs policy evaluation. Inputs: the policy to evaluate, the environment env, the discount factor, and the convergence threshold. Output: the value function V to which the given policy converges.
def policy_eval(policy, env, discount_factor=1, threshold=0.00001):
    """
    Evaluate a policy given an environment and a full description of the environment's dynamics.
    
    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is the number of states in the environment.
            env.nA is the number of actions in the environment.
        discount_factor: Gamma discount factor.
        threshold: We stop evaluation once the value function changes by less than threshold for every state.
    
    Returns:
        Vector of length env.nS representing the value function.
    """
    # Initialize the state-value function of every state to zero
    V = np.zeros(env.nS)
    i = 0
    print("第%d次输出各个状态值为" % i)
    print(V.reshape(5,5))
    print("-"*50)
    while True:
        value_delta = 0
        # Sweep over all states
        for s in range(env.nS):
            v = 0
            # Loop over the probability of each action (up, right, down, left)
            for a, action_prob in enumerate(policy[s]):
                # For each action, look up its possible transitions
                # Transition tuple: prob (probability), next_state (index of the next state), reward, done (whether next_state is terminal)
                for prob, next_state, reward, done in env.P[s][a]:
                    # Bellman expectation backup: V(s) = sum_a pi(a|s) * sum_s' p(s'|s,a) * (r + gamma * V(s'))
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            # Track the largest change of any state value in this sweep
            value_delta = max(value_delta, np.abs(v - V[s]))
            V[s] = v
        i += 1
        print("第%d次输出各个状态值为"%i)
        print(V.reshape(5,5))
        print("-" * 50)
        # Converged: the largest change in this sweep is below the threshold, so stop
        if value_delta < threshold:
            print("第%d后,所得结果已经收敛,运算结束"%i)
            break
    return np.array(V)


env = GridworldEnv()
random_policy = np.ones([env.nS, env.nA]) / env.nA
v = policy_eval(random_policy, env)

print("最终值函数:")
print(v)

print("值函数的网格形式:")
print(v.reshape(env.shape))

# Check that the value function of the random policy matches the expected result
expected_v = np.array([-47,-42,-31,-18,-20,-48,-42,-29,0,-18,-51,-47,-39,-29,-31,-54,-52,-47,-43,-42,-57,-55,-52,-48,-47])
np.testing.assert_array_almost_equal(v, expected_v, decimal=0)
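
As a sanity check, the converged values can be plugged back into the Bellman expectation equation to confirm they are self-consistent. This is a minimal sketch that reuses env, random_policy and v from the script above (and, like that call, assumes discount_factor = 1):

bellman_rhs = np.array([
    sum(action_prob * prob * (reward + v[next_state])
        for a, action_prob in enumerate(random_policy[s])
        for prob, next_state, reward, done in env.P[s][a])
    for s in range(env.nS)
])
# The self-consistency error should be tiny compared to the state values themselves.
print("max Bellman residual:", np.max(np.abs(bellman_rhs - v)))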
