Policy Iteration algorithm implementation: the gridworld environment

The complete policy iteration program is given below; it prints the value function after every evaluation sweep and the improved policy after every improvement step, so the intermediate results can be followed directly from the output.
class GridWorld:
    def __init__(self, number):
        self.size = number
        self.states = [i for i in range(number ** 2)]
        self.terminate_states = [0, number ** 2 - 1]
        self.actions = ['up', 'down', 'left', 'right']
        self.gamma = 1
        # Initialize V(s) to zero for every state
        self.v_s = dict()
        for s in self.states:
            self.v_s[s] = 0.0
        # Initialize pi(s) as the equiprobable random policy over all four actions
        self.pi_s = dict()
        for s in self.states:
            self.pi_s[s] = []
            if s in self.terminate_states:
                continue
            self.pi_s[s].append("up")
            self.pi_s[s].append("down")
            self.pi_s[s].append("left")
            self.pi_s[s].append("right")
        # Initialize the improved policy pi_optimal(s); copy each action list
        # so that updating pi_optimal never mutates pi_s through shared lists
        self.pi_optimal = {s: list(a) for s, a in self.pi_s.items()}
    def state_to_grid(self, s):
        # Convert a state index into (row, col) grid coordinates
        row = s // self.size
        col = s % self.size
        return row, col

    def grid_to_state(self, row, col):
        return row * self.size + col

    def action(self, s, a):
        # Take action a in state s and return (next_state, reward).
        # Terminal states are absorbing and yield reward 0; every other move
        # costs -1, and moves off the grid leave the position unchanged.
        if s in self.terminate_states:
            return s, 0
        row, col = self.state_to_grid(s)
        reward = -1
        if a == "up":
            row = max(row - 1, 0)
        if a == "down":
            row = min(row + 1, self.size - 1)
        if a == "left":
            col = max(col - 1, 0)
        if a == "right":
            col = min(col + 1, self.size - 1)
        state_ = self.grid_to_state(row, col)
        return state_, reward
    def policy_evaluation(self, theta):
        print("------- k = 0 --------")
        self.showV(self.v_s)
        for i in range(10000):
            delta = 0
            lastV = self.v_s.copy()  # V(s) from the previous sweep
            for s in self.states:
                if s in self.terminate_states:
                    continue
                v = lastV[s]
                # Evaluate the current (possibly improved) policy, taken to be
                # uniform over its action list, so the divisor is len(actions)
                # rather than a hard-coded 4
                actions = self.pi_optimal[s]
                nextV = 0.0
                for a in actions:
                    s_, r = self.action(s, a)
                    nextV += (r + self.gamma * lastV[s_]) / len(actions)
                self.v_s[s] = nextV
                delta = max(delta, abs(v - self.v_s[s]))
            print("------- k = %d --------" % (i + 1))
            self.showV(self.v_s)
            if delta < theta:
                break
    def showV(self, v_s):
        # Print the value function as a size x size grid
        for row in range(self.size):
            print(', '.join('%.2f' % v_s[self.grid_to_state(row, col)]
                            for col in range(self.size)))
        print("-----------------------")
    def policy_improvement(self):
        policy_stable = True
        for s in self.states:
            if s in self.terminate_states:
                continue
            old_action = self.pi_optimal[s]
            max_v = float('-inf')
            for a in self.actions:
                s_, r = self.action(s, a)
                v = r + self.gamma * self.v_s[s_]  # action value q(s, a)
                if v > max_v:
                    max_v = v
                    self.pi_optimal[s] = [a]
                elif v == max_v and a not in self.pi_optimal[s]:
                    self.pi_optimal[s].append(a)  # keep ties: all greedy actions
            if old_action != self.pi_optimal[s]:
                policy_stable = False
            print(s, self.pi_optimal[s])
        return policy_stable
if __name__ == '__main__':
    env = GridWorld(4)
    theta = 0.00001
    for _ in range(100):
        # Policy evaluation
        env.policy_evaluation(theta)
        # Policy improvement
        policy_stable = env.policy_improvement()
        print(policy_stable)
        if policy_stable:
            break
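Iterative sweeps are not the only way to carry out policy evaluation. Because the Bellman expectation equation is linear in V, a fixed policy can also be evaluated in closed form by solving (I - gamma * P_pi) v = r_pi over the non-terminal states. Below is a minimal NumPy sketch of this for the initial equiprobable random policy; it is an addition for illustration, and the function name evaluate_random_policy_exactly and its inline step helper are made up here, not part of the original code:

import numpy as np

def evaluate_random_policy_exactly(size=4, gamma=1.0):
    # Solve (I - gamma * P) v = r for the equiprobable random policy.
    # Terminal states (0 and size*size - 1) are fixed at V = 0, so the
    # linear system is built over the non-terminal states only; there the
    # transition matrix is substochastic (probability leaks into the
    # terminals), which keeps I - gamma * P invertible even at gamma = 1.
    n = size * size
    terminals = {0, n - 1}
    nonterm = [s for s in range(n) if s not in terminals]
    idx = {s: k for k, s in enumerate(nonterm)}  # state -> row index

    def step(s, a):
        # Same deterministic dynamics as the GridWorld class above
        row, col = divmod(s, size)
        if a == 'up':    row = max(row - 1, 0)
        if a == 'down':  row = min(row + 1, size - 1)
        if a == 'left':  col = max(col - 1, 0)
        if a == 'right': col = min(col + 1, size - 1)
        return row * size + col

    P = np.zeros((len(nonterm), len(nonterm)))
    r = np.zeros(len(nonterm))
    for s in nonterm:
        for a in ['up', 'down', 'left', 'right']:
            s_ = step(s, a)
            r[idx[s]] += -1 / 4  # every move costs -1 under the random policy
            if s_ not in terminals:
                P[idx[s], idx[s_]] += 1 / 4
    v = np.linalg.solve(np.eye(len(nonterm)) - gamma * P, r)
    return {s: (0.0 if s in terminals else v[idx[s]]) for s in range(n)}

print(evaluate_random_policy_exactly())

The solution is exactly the fixed point that policy_evaluation approaches iteratively, so it is a useful cross-check on the first evaluation sweep of the program above.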
Value Iteration algorithm implementation
class GridWorld:
    def __init__(self, number):
        self.size = number
        self.states = [i for i in range(number ** 2)]
        self.terminate_states = [0, number ** 2 - 1]
        self.actions = ['up', 'down', 'left', 'right']
        self.gamma = 1
        # Initialize V(s) to zero for every state
        self.v_s = dict()
        for s in self.states:
            self.v_s[s] = 0.0
        # Initialize pi(s) as the equiprobable random policy over all four actions
        self.pi_s = dict()
        for s in self.states:
            self.pi_s[s] = []
            if s in self.terminate_states:
                continue
            self.pi_s[s].append("up")
            self.pi_s[s].append("down")
            self.pi_s[s].append("left")
            self.pi_s[s].append("right")
        # Initialize the greedy policy pi_optimal(s); copy each action list
        # so that updating pi_optimal never mutates pi_s through shared lists
        self.pi_optimal = {s: list(a) for s, a in self.pi_s.items()}
    def state_to_grid(self, s):
        # Convert a state index into (row, col) grid coordinates
        row = s // self.size
        col = s % self.size
        return row, col

    def grid_to_state(self, row, col):
        return row * self.size + col

    def action(self, s, a):
        # Take action a in state s and return (next_state, reward).
        # Terminal states are absorbing and yield reward 0; every other move
        # costs -1, and moves off the grid leave the position unchanged.
        if s in self.terminate_states:
            return s, 0
        row, col = self.state_to_grid(s)
        reward = -1
        if a == "up":
            row = max(row - 1, 0)
        if a == "down":
            row = min(row + 1, self.size - 1)
        if a == "left":
            col = max(col - 1, 0)
        if a == "right":
            col = min(col + 1, self.size - 1)
        state_ = self.grid_to_state(row, col)
        return state_, reward
    def value_iteration(self, theta):
        print("------- k = 0 --------")
        self.showV(self.v_s)
        for i in range(10000):
            delta = 0
            lastV = self.v_s.copy()  # V(s) from the previous sweep
            for s in self.states:
                if s in self.terminate_states:
                    continue
                v = lastV[s]
                nextMaxV = float('-inf')
                # Bellman optimality backup: V(s) = max_a [r + gamma * V(s')];
                # the maximum is taken over all actions, with no averaging
                for a in self.actions:
                    s_, r = self.action(s, a)
                    nextV = r + self.gamma * lastV[s_]
                    if nextV > nextMaxV:
                        nextMaxV = nextV
                        self.pi_optimal[s] = [a]
                    elif nextV == nextMaxV and a not in self.pi_optimal[s]:
                        self.pi_optimal[s].append(a)  # keep tied greedy actions
                self.v_s[s] = nextMaxV
                delta = max(delta, abs(v - self.v_s[s]))
            print("------- k = %d --------" % (i + 1))
            self.showV(self.v_s)
            if delta < theta:
                print("------- output an optimal policy --------")
                print(self.pi_optimal)
                break
    def showV(self, v_s):
        # Print the value function as a size x size grid
        for row in range(self.size):
            print(', '.join('%.2f' % v_s[self.grid_to_state(row, col)]
                            for col in range(self.size)))
        print("-----------------------")
if __name__ == '__main__':
    env = GridWorld(4)
    theta = 0.00001
    # Value iteration
    env.value_iteration(theta)
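As a quick sanity check (an addition, not part of the original post): with a reward of -1 per move and gamma = 1, the optimal value of each state is simply minus the number of moves to the nearest terminal corner, i.e. minus its Manhattan distance, so the converged value-iteration output can be verified independently. The helper name expected_optimal_values is made up here:

def expected_optimal_values(size=4):
    # Shortest-path distances to the nearest terminal corner, negated.
    # On an open grid with 4-connected moves, the shortest path length
    # equals the Manhattan distance, so this reproduces V*(s) directly.
    n = size * size
    terminals = [0, n - 1]
    v = {}
    for s in range(n):
        row, col = divmod(s, size)
        dists = [abs(row - t // size) + abs(col - t % size) for t in terminals]
        v[s] = -min(dists)
    return v

print(expected_optimal_values())

Because value iteration folds the greedy improvement into every backup, it typically reaches this fixed point in far fewer sweeps than running full policy evaluation to convergence inside each policy iteration step.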