A custom reinforcement-learning environment can be built on the foundation below, and it extends well.
For details, see ElegantRL: https://github.com/AI4Finance-LLC/ElegantRL
- Define the environment (print environment info; cast data to float32).
- Get the environment info, seven values in total: (str, int, int, int, int, bool, float) -> (env_name, state_dim, action_dim, action_max, max_step, if_discrete, target_return).
- Reset the environment after each episode, with state normalization.
import gym
import numpy as np

gym.logger.set_level(40)  # suppress gym's UserWarning messages
class PreprocessEnv(gym.Wrapper):  # env wrapper: dtype cast, action scaling, optional state normalization
    def __init__(self, env, if_print=True, data_type=np.float32):
        self.env = gym.make(env) if isinstance(env, str) else env
        super(PreprocessEnv, self).__init__(self.env)
        self.data_type = data_type

        (self.env_name, self.state_dim, self.action_dim, self.action_max, self.max_step,
         self.if_discrete, self.target_return) = get_gym_env_info(self.env, if_print)

        state_avg, state_std = get_avg_std__for_state_norm(self.env_name)
        if state_avg is not None:  # normalization statistics are known for this env
            self.neg_state_avg = -state_avg
            self.div_state_std = 1 / (state_std + 1e-4)
            # bind the normalized variants once, so there is no per-step branch
            self.reset = self.reset_norm
            self.step = self.step_norm
        else:
            self.reset = self.reset_type
            self.step = self.step_type

    def reset_type(self) -> np.ndarray:
        """state = env.reset()
        convert the data type of state from float64 to float32
        :return array state: state.shape == (state_dim, )
        """
        state = self.env.reset()
        return state.astype(self.data_type)

    def reset_norm(self) -> np.ndarray:
        """state = env.reset()
        convert the data type of state from float64 to float32
        and normalize the state
        :return array state: state.shape == (state_dim, )
        """
        state = self.env.reset()
        state = (state + self.neg_state_avg) * self.div_state_std
        return state.astype(self.data_type)

    def step_type(self, action) -> (np.ndarray, float, bool, dict):
        """next_state, reward, done, info = env.step(action)
        convert the data type of state from float64 to float32,
        adjust the action range to (-action_max, +action_max)
        :return array state: state.shape == (state_dim, )
        :return float reward: reward of one step
        :return bool done: whether the training episode has terminated
        :return dict info: extra information in a dict (OpenAI gym standard); `None` is also fine
        """
        state, reward, done, info = self.env.step(action * self.action_max)
        return state.astype(self.data_type), reward, done, info

    def step_norm(self, action) -> (np.ndarray, float, bool, dict):
        """same as step_type(), plus state normalization"""
        state, reward, done, info = self.env.step(action * self.action_max)
        state = (state + self.neg_state_avg) * self.div_state_std
        return state.astype(self.data_type), reward, done, info

def get_avg_std__for_state_norm(env_name) -> (np.ndarray, np.ndarray):
    """return the state normalization data: avg and std
    ReplayBuffer.print_state_norm() will print `neg_avg` and `div_std`.
    You can save such arrays here, and PreprocessEnv will load them automatically,
    e.g. `state = (state + self.neg_state_avg) * self.div_state_std` in `PreprocessEnv.step_norm()`
        neg_avg = -states.mean()
        div_std = 1 / (states.std() + 1e-5)  or  6 / (states.max() - states.min())
    :str env_name: the name of the environment, used to look up avg and std
    :return array avg: avg.shape == (state_dim, )
    :return array std: std.shape == (state_dim, )
    """
    avg = None
    std = None
    if env_name == 'LunarLanderContinuous-v2':
        avg = np.array([1.65470898e-02, -1.29684399e-01, 4.26883133e-03, -3.42124557e-02,
                        -7.39076972e-03, -7.67103031e-04, 1.12640885e+00, 1.12409466e+00])
        std = np.array([0.15094465, 0.29366297, 0.23490797, 0.25931464, 0.21603736,
                        0.25886878, 0.277233, 0.27771219])
    return avg, std
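
For an environment not listed above, the statistics can be estimated offline by rolling out a random policy and aggregating the visited states. A minimal sketch (the helper name and episode count are illustrative, not part of ElegantRL; it reuses the `gym`/`numpy` imports above and the old gym `reset`/`step` API used throughout this snippet):

def estimate_state_avg_std(env_name, n_episodes=32) -> (np.ndarray, np.ndarray):
    """roll out a random policy and return the per-dimension mean and std of the states"""
    env = gym.make(env_name)
    states = []
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            states.append(state)
            state, reward, done, info = env.step(env.action_space.sample())
    states = np.stack(states)
    return states.mean(axis=0), states.std(axis=0)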

def get_gym_env_info(env, if_print) -> (str, int, int, int, int, bool, float):
    gym.logger.set_level(40)  # suppress gym's UserWarning messages
    assert isinstance(env, gym.Env)

    env_name = env.unwrapped.spec.id
    state_shape = env.observation_space.shape
    state_dim = state_shape[0] if len(state_shape) == 1 else state_shape  # sometimes state_dim is a tuple

    target_return = getattr(env, 'target_return', None)
    target_return_default = getattr(env.spec, 'reward_threshold', None)  # reward_threshold lives on env.spec
    if target_return is None:
        target_return = target_return_default
    if target_return is None:
        target_return = 2 ** 16

    max_step = getattr(env, 'max_step', None)
    max_step_default = getattr(env, '_max_episode_steps', None)  # note the trailing 's'
    if max_step is None:
        max_step = max_step_default
    if max_step is None:
        max_step = 2 ** 10

    if_discrete = isinstance(env.action_space, gym.spaces.Discrete)
    if if_discrete:  # discrete action space
        action_dim = env.action_space.n
        action_max = int(1)
    elif isinstance(env.action_space, gym.spaces.Box):  # continuous action space
        action_dim = env.action_space.shape[0]
        action_max = float(env.action_space.high[0])
        assert not any(env.action_space.high + env.action_space.low)  # expect a symmetric range: low == -high
    else:
        raise RuntimeError('please set these values manually: if_discrete=bool, action_dim=int, action_max=1.0')

    if if_print:
        print(f"\n| env_name: {env_name}, state_dim: {state_dim},"
              f"\n| action_dim: {action_dim}, action_max: {action_max},"
              f"\n| max_step: {max_step}, if_discrete: {if_discrete},"
              f"\n| target_return: {target_return}")
    return env_name, state_dim, action_dim, action_max, max_step, if_discrete, target_return
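
A minimal usage sketch (the random action is a stand-in for a policy output in (-1, +1); it assumes the old gym `reset`/`step` API as above):

env = PreprocessEnv(env='LunarLanderContinuous-v2', if_print=True)
state = env.reset()  # float32, normalized because stats for this env are stored above
action = np.random.uniform(-1, 1, size=env.action_dim)
next_state, reward, done, info = env.step(action)  # rescaled internally to (-action_max, +action_max)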