4.1 DP: Policy Iteration and Value Iteration Code Implementation


Policy Iteration implementation: the gridworld environment
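The environment is the classic 4x4 gridworld (essentially Example 4.1 in Sutton & Barto): 16 states numbered 0-15, two terminal states at the corners (0 and 15), four deterministic actions (up, down, left, right; a move off the grid leaves the state unchanged), reward -1 on every step, gamma = 1, and an equiprobable random starting policy. For reference, the two updates the code below implements are (transitions are deterministic, so s' is the unique successor of taking a in s):

V_{k+1}(s) = \sum_a \pi(a \mid s) \, [\, r(s,a) + \gamma V_k(s') \,]        (iterative policy evaluation)

\pi'(s) = \arg\max_a \, [\, r(s,a) + \gamma V(s') \,]                       (greedy policy improvement)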

 

Policy iteration result: (figure)

class GridWorld:
    def __init__(self, number):
        # number is the grid side length; note that the geometry below
        # (state_to_grid, the boundary checks and showV) is hard-coded for
        # number = 4, i.e. a 4x4 grid
        self.states = [i for i in range(number**2)]
        self.terminate_states = [0, number**2-1]
        self.actions = ['up', 'down', 'left', 'right']
        self.gamma = 1

        # Initialize the state-value function V(s)
        self.v_s = dict()   # dict() creates an empty dictionary: state -> value
        for s in self.states:
            self.v_s[s] = 0.0

        # Initialize pi(s): the equiprobable random policy (all four actions in every non-terminal state)
        self.pi_s = dict()
        for s in self.states:
            self.pi_s[s] =[]
            if s in self.terminate_states:
                continue
            self.pi_s[s].append("up")
            self.pi_s[s].append("down")
            self.pi_s[s].append("left")
            self.pi_s[s].append("right")

        # Initialize the greedy (improved) policy pi_optimal(s)
        # (a shallow copy is enough here: each entry is rebound to a fresh list before it is modified)
        self.pi_optimal = self.pi_s.copy()

    def state_to_grid(self, s):
        # Convert a state index into (row, col) grid coordinates
        row = s // 4
        col = s % 4
        return row, col

    def grid_to_state(self, row, col):
        return row*4 + col

    def action(self, s, action):
        # One environment step: terminal states are absorbing and give reward 0;
        # every other transition gives reward -1, and a move that would leave
        # the grid keeps the agent in place.
        if s in self.terminate_states:
            return s, 0
        row, col = self.state_to_grid(s)
        reward = -1
        if action == "up":
            row = row - 1
            if row < 0: row = 0
        if action == "down":
            row = row + 1
            if row > 3: row = 3
        if action == "left":
            col = col - 1
            if col < 0: col = 0
        if action == "right":
            col = col + 1
            if col > 3: col = 3
        state_ = self.grid_to_state(row, col)
        return state_, reward

    def policy_evaluation(self, theta):
        # Iterative policy evaluation: sweep all states repeatedly until the
        # largest value change in a sweep is below theta
        print("------- k = 0 --------")
        self.showV(self.v_s)
        for i in range(10000):
            delta = 0
            lastV = self.v_s.copy()  # V(s) from the previous sweep (synchronous backup)
            for s in self.states:
                if s in self.terminate_states:
                    continue

                v = lastV[s]

                actions = self.pi_s[s]
                nextV = 0.0
                for a in actions:
                    s_, r = self.action(s, a)
                    # each action has probability 1/|A(s)| under the current policy
                    nextV += (r + self.gamma * lastV[s_]) / len(actions)
                self.v_s[s] = nextV
                delta = max(delta, abs(v - self.v_s[s]))
            print("------- k = %d --------" % (i + 1))
            self.showV(self.v_s)
            if delta < theta:
                break

    def showV(self, v_s):
        # Print V(s) as a 4x4 grid
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[0], v_s[1], v_s[2], v_s[3]))
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[4], v_s[5], v_s[6], v_s[7]))
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[8], v_s[9], v_s[10], v_s[11]))
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[12], v_s[13], v_s[14], v_s[15]))
        print("-----------------------")

    def policy_improvement(self):
        # Greedy policy improvement: make pi_optimal greedy with respect to the
        # current value function V(s); return True if the policy did not change
        policy_stable = True
        for s in self.states:
            if s in self.terminate_states:
                continue

            old_action = self.pi_optimal[s]
            max_v = float('-inf')
            for a in self.actions:
                s_, r = self.action(s, a)
                # action value q(s, a) = r + gamma * V(s')
                v = r + self.gamma * self.v_s[s_]
                if v > max_v:
                    max_v = v
                    self.pi_optimal[s] = [a]   # strictly better action: start a fresh list
                elif v == max_v and a not in self.pi_optimal[s]:
                    self.pi_optimal[s].append(a)   # tie: keep all equally good actions

            if old_action != self.pi_optimal[s]:
                policy_stable = False
            print(s, self.pi_optimal[s])
        return policy_stable


if __name__ == '__main__':
    env = GridWorld(4)
    theta = 0.00001

    for _ in range(100):
        # policy evaluation
        env.policy_evaluation(theta)

        # policy improvement
        policy_stable = env.policy_improvement()
        print(policy_stable)

        if policy_stable:
            break

        # evaluate the improved (greedy) policy in the next iteration
        env.pi_s = {s: list(a) for s, a in env.pi_optimal.items()}
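To read the converged policy more easily, a small helper along the following lines can print pi_optimal as arrows on the 4x4 grid (a minimal sketch; show_policy and ARROWS are illustrative names, not part of the code above). Terminal states are marked 'T':

ARROWS = {'up': '↑', 'down': '↓', 'left': '←', 'right': '→'}

def show_policy(env):
    # Print the greedy actions of each state as arrows, row by row
    for row in range(4):
        cells = []
        for col in range(4):
            s = env.grid_to_state(row, col)
            if s in env.terminate_states:
                cells.append('T')
            else:
                cells.append(''.join(ARROWS[a] for a in env.pi_optimal[s]))
        print('\t'.join(cells))

# usage, after the loop above has converged:
# show_policy(env)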

 
 
 


Value Iteration implementation
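Value iteration folds the improvement step into the evaluation sweep: instead of averaging over the policy, each state is backed up with the Bellman optimality equation, and the greedy policy is read off as the values converge. For reference:

V_{k+1}(s) = \max_a \, [\, r(s,a) + \gamma V_k(s') \,]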

 

class GridWorld:
    def __init__(self, number):
        # number is the grid side length; note that the geometry below
        # (state_to_grid, the boundary checks and showV) is hard-coded for
        # number = 4, i.e. a 4x4 grid
        self.states = [i for i in range(number**2)]
        self.terminate_states = [0, number**2-1]
        self.actions = ['up', 'down', 'left', 'right']
        self.gamma = 1

        # Initialize the state-value function V(s)
        self.v_s = dict()   # dict() creates an empty dictionary: state -> value
        for s in self.states:
            self.v_s[s] = 0.0

        # Initialize pi(s): the equiprobable random policy (all four actions in every non-terminal state)
        self.pi_s = dict()
        for s in self.states:
            self.pi_s[s] =[]
            if s in self.terminate_states:
                continue
            self.pi_s[s].append("up")
            self.pi_s[s].append("down")
            self.pi_s[s].append("left")
            self.pi_s[s].append("right")

        # Initialize the greedy policy pi_optimal(s)
        # (a shallow copy is enough here: each entry is rebound to a fresh list before it is modified)
        self.pi_optimal = self.pi_s.copy()

    def state_to_grid(self, s):
        # Convert a state index into (row, col) grid coordinates
        row = s // 4
        col = s % 4
        return row, col

    def grid_to_state(self, row, col):
        return row*4 + col

    def action(self, s, action):
        # One environment step: terminal states are absorbing and give reward 0;
        # every other transition gives reward -1, and a move that would leave
        # the grid keeps the agent in place.
        if s in self.terminate_states:
            return s, 0
        row, col = self.state_to_grid(s)
        reward = -1
        if action == "up":
            row = row - 1
            if row < 0: row = 0
        if action == "down":
            row = row + 1
            if row > 3: row = 3
        if action == "left":
            col = col - 1
            if col < 0: col = 0
        if action == "right":
            col = col + 1
            if col > 3: col = 3
        state_ = self.grid_to_state(row, col)
        return state_, reward

    def value_iteration(self, theta):
        # Value iteration: back up each state with the Bellman optimality
        # equation and record the greedy (argmax) actions along the way
        print("------- k = 0 --------")
        self.showV(self.v_s)
        for i in range(10000):
            delta = 0
            lastV = self.v_s.copy()  # V(s) from the previous sweep (synchronous backup)
            for s in self.states:
                if s in self.terminate_states:
                    continue

                v = lastV[s]

                nextMaxV = float('-inf')
                for a in self.actions:
                    s_, r = self.action(s, a)
                    # Bellman optimality backup: q(s, a) = r + gamma * V(s')
                    nextV = r + self.gamma * lastV[s_]
                    if nextV > nextMaxV:
                        nextMaxV = nextV
                        self.pi_optimal[s] = [a]   # strictly better action: start a fresh list
                    elif nextV == nextMaxV and a not in self.pi_optimal[s]:
                        self.pi_optimal[s].append(a)   # tie: keep all equally good actions

                self.v_s[s] = nextMaxV             # V(s) <- max_a q(s, a)
                delta = max(delta, abs(v - self.v_s[s]))
            print("------- k = %d --------" % (i + 1))
            self.showV(self.v_s)
            if delta < theta:
                print("------- output an optimal policy --------")
                print(self.pi_optimal)
                break

    def showV(self, v_s):
        # Print V(s) as a 4x4 grid
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[0], v_s[1], v_s[2], v_s[3]))
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[4], v_s[5], v_s[6], v_s[7]))
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[8], v_s[9], v_s[10], v_s[11]))
        print('%.2f, %.2f, %.2f, %.2f' % (v_s[12], v_s[13], v_s[14], v_s[15]))
        print("-----------------------")


if __name__ == '__main__':
    env = GridWorld(4)
    theta = 0.00001

    # value iteration
    env.value_iteration(theta)
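Because gamma = 1 and every non-terminal step costs -1, the optimal value of a state should equal minus the number of steps to the nearest terminal corner. A quick sanity check along these lines (a sketch, assuming the value_iteration run above has finished) compares env.v_s with those shortest-path distances:

# Each state's shortest path to a terminal corner is its Manhattan distance
# to (0, 0) or (3, 3), whichever is smaller.
for s in env.states:
    row, col = env.state_to_grid(s)
    dist = min(row + col, (3 - row) + (3 - col))
    assert abs(env.v_s[s] - (-dist)) < 1e-6, (s, env.v_s[s], -dist)
print("V*(s) matches -shortest_path(s) for all states")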