Testing the SARSA model


Core code

    def sample(self, obs):
        # Given an observation, produce an action (with exploration)
        greed = np.random.uniform(0, 1) < (1 - self.epsilon)
        if greed:
            action = self.predict(obs)
        else:
            action = np.random.choice(self.act_n)
        return action

    def predict(self, obs):
        # Look up the row of Q-values for this observation
        obs_lst = self.Q[obs, :]  # Q-values of all actions in this state

        # Find the maximum Q-value (several actions may tie)
        obs_max = np.max(obs_lst)
        max_action_lst = np.where(obs_lst == obs_max)[0]  # indices (actions) with the max value

        action = np.random.choice(max_action_lst)  # break ties randomly
        return action

    # Learn: update the Q-table
    def learn(self, obs, action, reward, next_obs, next_action, done):
        """ on-policy
            obs: observation before the interaction, s_t
            action: action chosen for this interaction, a_t
            reward: reward received for this action, r
            next_obs: observation after the interaction, s_t+1
            next_action: action that will be taken at next_obs under the current Q-table, a_t+1
            done: whether the episode has ended
        """
        # Current Q-value
        obs_act_q = self.Q[obs, action]

        # Compute the TD target
        if done:
            target_q = reward
        else:
            target_q = reward + self.gamma * self.Q[next_obs, next_action]
            # SARSA: s (current state), a (current action), r (reward), s' (next state), a' (next action)

        # Update the current Q-value
        self.Q[obs, action] += self.lr * (target_q - obs_act_q)
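
To see what one update does to the table, here is a minimal hand-worked sketch (not part of the original code; the state and action indices are made up) using the same learning rate and discount factor as the agent below:

    import numpy as np

    lr, gamma = 0.1, 0.9              # same hyperparameters as the agent
    Q = np.zeros((48, 4))             # CliffWalking-sized Q-table: 48 states x 4 actions
    Q[25, 1] = 2.0                    # pretend Q(s', a') has already been learned

    obs, action, reward = 24, 1, -1.0            # hypothetical transition (s, a, r)
    next_obs, next_action, done = 25, 1, False   # and the next state-action pair (s', a')

    target_q = reward + gamma * Q[next_obs, next_action]  # -1 + 0.9 * 2.0 = 0.8
    Q[obs, action] += lr * (target_q - Q[obs, action])    # 0 + 0.1 * (0.8 - 0) = 0.08
    print(Q[obs, action])  # roughly 0.08 (up to floating-point rounding)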

Full code

Main program

import gym

from gridworld import CliffWalkingWapper
from sarsa_agent import SarsaAgent
import time
# Run one training episode (with learning)
def run_episode(env, agent, render=False):
    total_steps = 0
    total_reward = 0

    obs = env.reset()
    action = agent.sample(obs)  # choose the initial action

    while True:
        # collect s = obs, a = action, r = reward, s' = next_obs, a' = next_action
        next_obs, reward, done, _ = env.step(action)

        # with some probability, take a random action
        next_action = agent.sample(next_obs)  # 0 up, 1 right, 2 down, 3 left

        # print('learning data', obs, action, reward, next_obs, next_action, done)
        agent.learn(obs, action, reward, next_obs, next_action, done)

        action = next_action
        obs = next_obs
        total_reward += reward
        total_steps += 1
        if render:
            env.render()
        if done:
            break
    return total_reward, total_steps


def test_episode(env, agent):
    total_reward = 0
    obs = env.reset()

    while True:
        action = agent.predict(obs)  # greedy

        next_obs, reward, done, _ = env.step(action)
        total_reward += reward
        obs = next_obs
        time.sleep(0.5)
        env.render()
        if done:
            break
    return total_reward


if __name__ == '__main__':
    env = gym.make('CliffWalking-v0')
    env = CliffWalkingWapper(env)

    # create an agent instance with the hyperparameters
    agent = SarsaAgent(
        obs_n=env.observation_space.n,
        act_n=env.action_space.n,
        learning_rate=0.1,
        gamma=0.9,
        e_greed=0.1)

    # train for 500 episodes; render every 20th episode and print each episode's score
    for episode in range(500):
        render = True if episode % 20 == 0 else False
        ep_reward, ep_steps = run_episode(env, agent, render)
        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, ep_reward))

    # training finished; evaluate the learned policy
    test_reward = test_episode(env, agent)
    print('test reward = %.1f' % test_reward)

    agent.save()
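
Since the agent writes its Q-table to ./q_table.npy via agent.save(), a separate script could reload it later and replay the greedy policy without retraining. The following is a minimal sketch under that assumption (it is not part of the original post):

    import gym
    from gridworld import CliffWalkingWapper
    from sarsa_agent import SarsaAgent

    env = CliffWalkingWapper(gym.make('CliffWalking-v0'))
    agent = SarsaAgent(obs_n=env.observation_space.n, act_n=env.action_space.n)
    agent.restore('./q_table.npy')  # load the Q-table produced by agent.save()

    obs, total_reward, done = env.reset(), 0, False
    while not done:
        obs, reward, done, _ = env.step(agent.predict(obs))  # greedy actions only
        total_reward += reward
    print('test reward = %.1f' % total_reward)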

sarsa_agent.py

import numpy as np


class SarsaAgent(object):
    def __init__(self, obs_n, act_n, learning_rate=0.01, gamma=0.9, e_greed=0.1):
        self.act_n = act_n  # number of possible actions
        self.obs_n = obs_n  # number of environment states
        self.lr = learning_rate  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = e_greed  # probability of taking a random action
        self.Q = np.zeros((obs_n, act_n))  # initialize the Q-table

    def sample(self, obs):
        # Given an observation, produce an action (with exploration)
        greed = np.random.uniform(0, 1) < (1 - self.epsilon)
        if greed:
            action = self.predict(obs)
        else:
            action = np.random.choice(self.act_n)
        return action

    # Predict the greedy action (no exploration)
    def predict(self, obs):
        # Look up the row of Q-values for this observation
        obs_lst = self.Q[obs, :]  # Q-values of all actions in this state

        # Find the maximum Q-value (several actions may tie)
        obs_max = np.max(obs_lst)
        max_action_lst = np.where(obs_lst == obs_max)[0]  # indices (actions) with the max value

        action = np.random.choice(max_action_lst)  # break ties randomly
        return action

    # Learn: update the Q-table
    def learn(self, obs, action, reward, next_obs, next_action, done):
        """ on-policy
            obs: observation before the interaction, s_t
            action: action chosen for this interaction, a_t
            reward: reward received for this action, r
            next_obs: observation after the interaction, s_t+1
            next_action: action that will be taken at next_obs under the current Q-table, a_t+1
            done: whether the episode has ended
        """
        # Current Q-value
        obs_act_q = self.Q[obs, action]

        # Compute the TD target
        if done:
            target_q = reward
        else:
            target_q = reward + self.gamma * self.Q[next_obs, next_action]
            # SARSA: s (current state), a (current action), r (reward), s' (next state), a' (next action)

        # Update the current Q-value
        self.Q[obs, action] += self.lr * (target_q - obs_act_q)

    def save(self):
        npy_file = './q_table.npy'
        np.save(npy_file, self.Q)
        print(npy_file + ' saved.')

    def restore(self, npy_file='./q_table.npy'):
        self.Q = np.load(npy_file)
        print(npy_file + ' loaded.')

gridworld.py

# -*- coding: utf-8 -*-

import gym
import turtle
import numpy as np

# turtle tutorial : https://docs.python.org/3.3/library/turtle.html


def GridWorld(gridmap=None, is_slippery=False):
    if gridmap is None:
        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)
    env = FrozenLakeWapper(env)
    return env


class FrozenLakeWapper(gym.Wrapper):
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.max_y = env.desc.shape[0]
        self.max_x = env.desc.shape[1]
        self.t = None
        self.unit = 50

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for _ in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            for i in range(self.desc.shape[0]):
                for j in range(self.desc.shape[1]):
                    x = j
                    y = self.max_y - 1 - i
                    if self.desc[i][j] == b'S':  # Start
                        self.draw_box(x, y, 'white')
                    elif self.desc[i][j] == b'F':  # Frozen ice
                        self.draw_box(x, y, 'white')
                    elif self.desc[i][j] == b'G':  # Goal
                        self.draw_box(x, y, 'yellow')
                    elif self.desc[i][j] == b'H':  # Hole
                        self.draw_box(x, y, 'black')
                    else:
                        self.draw_box(x, y, 'white')
            self.t.shape('turtle')

        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - int(self.s / self.max_x)
        self.move_player(x_pos, y_pos)


class CliffWalkingWapper(gym.Wrapper):
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.t = None
        self.unit = 50
        self.max_x = 12
        self.max_y = 4

    def draw_x_line(self, y, x0, x1, color='gray'):
        assert x1 > x0
        self.t.color(color)
        self.t.setheading(0)
        self.t.up()
        self.t.goto(x0, y)
        self.t.down()
        self.t.forward(x1 - x0)

    def draw_y_line(self, x, y0, y1, color='gray'):
        assert y1 > y0
        self.t.color(color)
        self.t.setheading(90)
        self.t.up()
        self.t.goto(x, y0)
        self.t.down()
        self.t.forward(y1 - y0)

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for i in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            for _ in range(2):
                self.t.forward(self.max_x * self.unit)
                self.t.left(90)
                self.t.forward(self.max_y * self.unit)
                self.t.left(90)
            for i in range(1, self.max_y):
                self.draw_x_line(
                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
            for i in range(1, self.max_x):
                self.draw_y_line(
                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)

            for i in range(1, self.max_x - 1):
                self.draw_box(i, 0, 'black')
            self.draw_box(self.max_x - 1, 0, 'yellow')
            self.t.shape('turtle')

        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - int(self.s / self.max_x)
        self.move_player(x_pos, y_pos)


if __name__ == '__main__':
    # Environment 1: FrozenLake; whether the ice is slippery is configurable
    # 0 left, 1 down, 2 right, 3 up
    env = gym.make("FrozenLake-v0", is_slippery=False)
    env = FrozenLakeWapper(env)

    # Environment 2: CliffWalking, the cliff-walking environment
    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)

    # Environment 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
    # gridmap = [
    #         'SFFF',
    #         'FHFF',
    #         'FFFF',
    #         'HFGF' ]
    # env = GridWorld(gridmap)

    env.reset()
    for step in range(10):
        action = np.random.randint(0, 4)
        obs, reward, done, info = env.step(action)
        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
                step, action, obs, reward, done, info))
        # env.render()  # render one frame

Q-learning and SARSA both belong to the family of temporal-difference (TD) reinforcement learning methods, not Monte Carlo methods. Temporal-difference learning combines ideas from dynamic programming and Monte Carlo methods: it updates incrementally from experience, using the current estimate together with an estimate of future value to approximate the optimal value function. Concretely, both Q-learning and SARSA are TD algorithms built on a Q-value function.

1. Q-learning: a model-free reinforcement learning algorithm rooted in dynamic programming. It uses TD updates to iteratively refine the Q-value estimates toward the optimal Q-values, combining the estimate of the current state-action pair with the maximum estimated value over actions at the next state.
2. SARSA: also a model-free, TD-based algorithm that iteratively updates the Q-value estimates. Unlike Q-learning, SARSA follows its policy to choose the next action and uses the estimate of that chosen next action when updating the Q-value, which is what makes it on-policy.

Compared with Monte Carlo methods, TD methods are more efficient and more adaptable: they can update at every time step instead of waiting for the episode to finish, and therefore tend to converge to a good policy faster. Monte Carlo methods must wait until the episode ends to obtain the full return before performing an update.
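
Since the only difference between the two algorithms is how the TD target is formed, the contrast can be written in a few lines. This is a sketch for comparison only, not part of the original code; the SARSA target mirrors the learn() method above, and the Q-learning target is shown alongside it:

    import numpy as np

    def sarsa_target(Q, reward, next_obs, next_action, gamma=0.9):
        # on-policy: bootstrap from the action the policy actually chose at s'
        return reward + gamma * Q[next_obs, next_action]

    def q_learning_target(Q, reward, next_obs, gamma=0.9):
        # off-policy: bootstrap from the greedy (maximum-value) action at s'
        return reward + gamma * np.max(Q[next_obs, :])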