RL: A Highly Extensible Environment

This post shows how to build an extensible custom reinforcement-learning environment on top of the ElegantRL library. ElegantRL provides a preprocessing class, PreprocessEnv, which handles data-type conversion, state normalization, and action-range rescaling. It also includes a helper that extracts key information from a Gym environment, such as the environment name, state dimension, and action dimension, plus a lookup table of state means and standard deviations used to standardize states.

A custom reinforcement-learning environment can be built on the foundation below, and it extends well.
For details, see ElegantRL: https://github.com/AI4Finance-LLC/ElegantRL

  1. Define the environment (print the environment information; data type float32).
  2. Get the environment information, 7 quantities in total: (str, int, int, int, int, bool, float) -> (env_name, state_dim, action_dim, action_max, max_step, if_discrete, target_return).
  3. Reset the environment after each episode, with state normalization.
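
For example (an illustrative value set; the exact numbers depend on the installed Gym version), for LunarLanderContinuous-v2 this 7-tuple would be roughly ('LunarLanderContinuous-v2', 8, 2, 1.0, 1000, False, 200): an 8-dimensional state, a 2-dimensional continuous action in (-1, +1), at most 1000 steps per episode, and a target return taken from the environment's reward_threshold. The implementation follows, with a short usage sketch after the code.
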
import os
import numpy as np
import gym
import random as rd

gym.logger.set_level(40)

class PreprocessEnv(gym.Wrapper):  # environment wrapper: unknown attributes fall through to self.env
    def __init__(self, env, if_print=True, data_type=np.float32):
        self.env = gym.make(env) if isinstance(env, str) else env
        super().__init__(self.env)
        self.data_type = data_type

        (self.env_name, self.state_dim, self.action_dim, self.action_max, self.max_step,
         self.if_discrete, self.target_return) = get_gym_env_info(self.env, if_print)

        state_avg, state_std = get_avg_std__for_state_norm(self.env_name)
        if state_avg is not None:
            self.neg_state_avg = -state_avg
            self.div_state_std = 1 / (state_std + 1e-4)

            self.reset = self.reset_norm
            self.step = self.step_norm

        else:
            self.reset = self.reset_type
            self.step = self.step_type

    def reset_type(self) -> np.ndarray:
        """ state = env.reset()
        convert the data type of state from float64 to float32
        :return array state: state.shape==(state_dim, )
        """
        state = self.env.reset()
        return state.astype(self.data_type)

    def reset_norm(self) -> np.ndarray:
        """ state = env.reset()
        convert the data type of state from float64 to float32
        do normalization on state
        :return array state: state.shape==(state_dim, )
        """
        state = self.env.reset()
        state = (state + self.neg_state_avg) * self.div_state_std
        return state.astype(self.data_type)

    def step_type(self, action) -> (np.ndarray, float, bool, dict):
        """ next_state, reward, done = env.step(action)
        convert the data type of state from float64 to float32,
        adjust action range to (-action_max, +action_max)
        :return array state:  state.shape==(state_dim, )
        :return float reward: reward of one step
        :return bool  done  : the terminal of an training episode
        :return dict  info  : the information save in a dict. OpenAI gym standard. Send a `None` is OK
        """
        state, reward, done, info = self.env.step(action * self.action_max)
        return state.astype(self.data_type), reward, done, info

    def step_norm(self, action) -> (np.ndarray, float, bool, dict):
        """same as step_type(), but additionally normalize the next state"""
        state, reward, done, info = self.env.step(action * self.action_max)
        state = (state + self.neg_state_avg) * self.div_state_std
        return state.astype(self.data_type), reward, done, info

def get_avg_std__for_state_norm(env_name) -> (np.ndarray, np.ndarray):
    """return the state normalization data: neg_avg and div_std
    ReplayBuffer.print_state_norm() will print `neg_avg` and `div_std`
    You can save these array to here. And PreprocessEnv will load them automatically.
    eg. `state = (state + self.neg_state_avg) * self.div_state_std` in `PreprocessEnv.step_norm()`
    neg_avg = -states.mean()
    div_std = 1/(states.std()+1e-5) or 6/(states.max()-states.min())
    :str env_name: the name of environment that helps to find neg_avg and div_std
    :return array avg: neg_avg.shape=(state_dim)
    :return array std: div_std.shape=(state_dim)
    """
    avg = None
    std = None
    if env_name == 'LunarLanderContinuous-v2':
        avg = np.array([1.65470898e-02, -1.29684399e-01, 4.26883133e-03, -3.42124557e-02,
                        -7.39076972e-03, -7.67103031e-04, 1.12640885e+00, 1.12409466e+00])
        std = np.array([0.15094465, 0.29366297, 0.23490797, 0.25931464, 0.21603736,
                        0.25886878, 0.277233, 0.27771219])
    return avg, std
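
# Illustrative sketch (an assumption, not part of the library): the avg/std arrays above
# can be estimated offline by rolling out a random policy and recording states, assuming
# the old Gym API where env.step() returns a 4-tuple:
#
#     states = []
#     _env = gym.make('LunarLanderContinuous-v2')
#     _state = _env.reset()
#     for _ in range(2 ** 12):
#         states.append(_state)
#         _state, _reward, _done, _info = _env.step(_env.action_space.sample())
#         if _done:
#             _state = _env.reset()
#     states = np.array(states, dtype=np.float32)
#     print('avg =', repr(states.mean(axis=0)))  # paste into get_avg_std__for_state_norm()
#     print('std =', repr(states.std(axis=0)))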

def get_gym_env_info(env, if_print) -> (str, int, int, int, int, bool, float):
    gym.logger.set_level(40)
    assert isinstance(env, gym.Env)
    env_name = env.unwrapped.spec.id

    state_shape = env.observation_space.shape
    state_dim = state_shape[0] if len(state_shape) == 1 else state_shape

    target_return = getattr(env, 'target_return', None)
    target_return_default = getattr(env.spec, 'reward_threshold', None)
    if target_return is None:
        target_return = target_return_default
    if target_return is None:
        target_return = 2**16

    max_step = getattr(env, 'max_step', None)
    max_step_default = getattr(env, '_max_episode_steps', None)
    if max_step is None:
        max_step = max_step_default
    if max_step is None:
        max_step = 2 ** 10

    if_discrete = isinstance(env.action_space, gym.spaces.Discrete)
    if if_discrete:
        action_dim = env.action_space.n
        action_max = int(1)
    elif isinstance(env.action_space, gym.spaces.Box):
        action_dim = env.action_space.shape[0]
        action_max = float(env.action_space.high[0])
        assert not any(env.action_space.high + env.action_space.low)  # require a symmetric action range: low == -high
    else:
        raise RuntimeError('please set these values manually: if_discrete=bool, action_dim=int, action_max=1.0')

    print(f"\n|env_name: {env_name}, state_dim: {state_dim},"
          f"\n|action_dim: {action_dim}, action_max: {action_max},"
          f"\n|max_step: {max_step}, if_discrete: {if_discrete},"
          f"\n|target_return:{target_return}")

    return env_name, state_dim, action_dim, action_max, max_step, if_discrete, target_return
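
A minimal usage sketch (assuming the old Gym step API that returns a 4-tuple, and a continuous-action environment such as LunarLanderContinuous-v2; the random "policy" below is only a placeholder):

if __name__ == '__main__':
    env = PreprocessEnv(env='LunarLanderContinuous-v2', if_print=True)

    state = env.reset()  # float32 state; normalized because avg/std are stored for this env
    for _ in range(env.max_step):
        action = np.random.uniform(-1, 1, size=env.action_dim)  # a trained policy would output actions in (-1, +1)
        state, reward, done, info = env.step(action)  # the wrapper rescales the action by action_max
        if done:
            state = env.reset()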








