Overview
This post covers the attributes of RLlib's policy module and how to define a custom policy. Since the model is a sub-component of the policy, the model module is described first. RLlib also distinguishes between the TensorFlow and PyTorch frameworks when building policies, so the two are treated separately. (For now only the Torch side is covered; the TensorFlow side will be added later.)
Model
When defining a custom model we subclass either TorchModelV2 or TFModelV2, both of which are built on the abstract class ModelV2.
The ModelV2 base class
class ModelV2:
    # constructor
    def __init__(
        self,
        obs_space: Space,               # gym.Space
        action_space: Space,            # gym.Space
        num_outputs: int,               # size of the model's output (logits) layer
        model_config: ModelConfigDict,  # model config
        name: str,                      # model name (user-defined)
        framework: str,                 # "tf", "tf2", or "torch"
    )
Define your own base_model as needed and call it inside forward().
# base_model: with TensorFlow it can be built with Keras; with PyTorch, with torch.nn. (Examples follow below.)
    def forward(
        self,
        input_dict: Dict[str, TensorType],  # contains "obs", "obs_flat", ...
        state: List[TensorType],            # usually empty for feed-forward models
        seq_lens: TensorType,               # None for feed-forward models
    ) -> (TensorType, List[TensorType]):
        # "obs_flat" is the observation flattened to 1D and can be fed straight into an MLP;
        # if the model contains a CNN, use "obs" instead.
        # The data coming from the sample batch has already passed through the preprocessor;
        # after ModelV2.__call__ both "obs" and "obs_flat" are available, so there is no need
        # to reshape the input yourself.
    @PublicAPI
    def custom_loss(
        self, policy_loss: TensorType, loss_inputs: Dict[str, TensorType]
    ) -> Union[List[TensorType], TensorType]:
        # The policy has already computed its loss; override this if that loss needs to be
        # modified or extended.

    def metrics(self) -> Dict[str, TensorType]:
        return {"own_metrics": ...}  # return any custom values you want reported

    def value_function(self) -> TensorType:
        # Must be called after forward(); it can only return values that were computed
        # (and cached) during that forward() call.
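The usual pattern is therefore to cache the value prediction inside forward() and return it from value_function(). A minimal PyTorch-flavored sketch (the attribute name _last_value and the layer sizes are made up purely for illustration):

import torch
import torch.nn as nn

class SketchModel:
    def __init__(self):
        self._policy_head = nn.Linear(8, 2)
        self._value_head = nn.Linear(8, 1)
        self._last_value = None  # filled in by forward()

    def forward(self, input_dict, state, seq_lens):
        features = input_dict["obs_flat"]              # already flattened by RLlib
        self._last_value = self._value_head(features)  # cache for value_function()
        return self._policy_head(features), state

    def value_function(self):
        # RLlib calls this right after forward(); only return what forward() cached.
        return torch.reshape(self._last_value, [-1])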
The TFModelV2 class
Compared to ModelV2, TFModelV2 only adds a few attributes that do not matter for model updates. The main point here is how to build a custom model on top of TFModelV2.
class MyModelClass(TFModelV2):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super(MyModelClass, self).__init__(obs_space, action_space, num_outputs, model_config, name)
        # Use your own keys in model_config to decide the network structure.
        # Actor-critic algorithms usually need two heads: policy + value.
        # DQN usually only needs the value head.
        input_layer = tf.keras.layers.Input(...)
        hidden_layer = tf.keras.layers.Dense(...)(input_layer)
        output_layer = tf.keras.layers.Dense(...)(hidden_layer)
        value_layer = tf.keras.layers.Dense(...)(hidden_layer)
        self.base_model = tf.keras.Model(
            input_layer, [output_layer, value_layer])
    def forward(self, input_dict, state, seq_lens):
        # use input_dict["obs"] or input_dict["obs_flat"], depending on the network
        model_out, self._value_out = self.base_model(input_dict["obs"])
        return model_out, state

    def metrics(self):
        pass

    def custom_loss(
        self, policy_loss: TensorType, loss_inputs: Dict[str, TensorType]
    ) -> Union[List[TensorType], TensorType]:
        pass

    def value_function(self) -> TensorType:
        # return the value cached during forward()
        return tf.reshape(self._value_out, [-1])
The TorchModelV2 class
class MyModelClass(TorchModelV2, nn.Module):  # must also inherit from nn.Module
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        self._hidden_layers = nn.Sequential(...)
        self._logits = ...        # policy output head
        self._value_branch = ...  # value output head

    def forward(self, input_dict, state, seq_lens):
        # use input_dict["obs"] or input_dict["obs_flat"], depending on the network
        features = self._hidden_layers(input_dict["obs_flat"])
        self._value_out = self._value_branch(features)  # cache for value_function()
        return self._logits(features), state

    def value_function(self):
        return torch.reshape(self._value_out, [-1])
How to use your custom model
Register the custom model first, then reference it via the "custom_model" key of model_config:
from ray.rllib.models import ModelCatalog
ModelCatalog.register_custom_model("MYmodel", MyModelClass)  # register the model
Then, in the tune/Trainer config:
"model": {"custom_model": "MYmodel"}
Model config
MODEL_DEFAULTS: ModelConfigDict = {
# Experimental flag.
# If True, try to use a native (tf.keras.Model or torch.Module) default
# model instead of our built-in ModelV2 defaults.
# If False (default), use "classic" ModelV2 default models.
# Note that this currently only works for:
# 1) framework != torch AND
# 2) fully connected and CNN default networks as well as
# auto-wrapped LSTM- and attention nets.
"_use_default_native_models": False,
# Experimental flag.
# If True, user specified no preprocessor to be created
# (via config._disable_preprocessor_api=True). If True, observations
# will arrive in model as they are returned by the env.
"_disable_preprocessor_api": False,
# Experimental flag.
# If True, RLlib will no longer flatten the policy-computed actions into
# a single tensor (for storage in SampleCollectors/output files/etc..),
# but leave (possibly nested) actions as-is. Disabling flattening affects:
# - SampleCollectors: Have to store possibly nested action structs.
# - Models that have the previous action(s) as part of their input.
# - Algorithms reading from offline files (incl. action information).
"_disable_action_flattening": False,
# === Built-in options ===
# FullyConnectedNetwork (tf and torch): rllib.models.tf|torch.fcnet.py
# These are used if no custom model is specified and the input space is 1D.
# Number of hidden layers to be used.
"fcnet_hiddens": [256, 256],
# Activation function descriptor.
# Supported values are: "tanh", "relu", "swish" (or "silu"),
# "linear" (or None).
"fcnet_activation": "tanh",
# VisionNetwork (tf and torch): rllib.models.tf|torch.visionnet.py
# These are used if no custom model is specified and the input space is 2D.
# Filter config: List of [out_channels, kernel, stride] for each filter.
# Example:
# Use None for making RLlib try to find a default filter setup given the
# observation space.
"conv_filters": None,
# Activation function descriptor.
# Supported values are: "tanh", "relu", "swish" (or "silu"),
# "linear" (or None).
"conv_activation": "relu",
# Some default models support a final FC stack of n Dense layers with given
# activation:
# - Complex observation spaces: Image components are fed through
# VisionNets, flat Boxes are left as-is, Discrete are one-hot'd, then
# everything is concated and pushed through this final FC stack.
# - VisionNets (CNNs), e.g. after the CNN stack, there may be
# additional Dense layers.
# - FullyConnectedNetworks will have this additional FCStack as well
# (that's why it's empty by default).
"post_fcnet_hiddens": [],
"post_fcnet_activation": "relu",
# For DiagGaussian action distributions, make the second half of the model
# outputs floating bias variables instead of state-dependent. This only
# has an effect if using the default fully connected net.
"free_log_std": False,
# Whether to skip the final linear layer used to resize the hidden layer
# outputs to size `num_outputs`. If True, then the last hidden layer
# should already match num_outputs.
# With no_final_linear == True, the num_outputs produced by the given model does not
# have to equal action_space.n; the remaining layers can then be added on top (e.g. via
# a model interface) to form the complete model.
"no_final_linear": False,
# Whether layers should be shared for the value function.
"vf_share_layers": True,
# == LSTM ==
# Whether to wrap the model with an LSTM.
"use_lstm": False,
# Max seq len for training the LSTM, defaults to 20.
"max_seq_len": 20,
# Size of the LSTM cell.
"lstm_cell_size": 256,
# Whether to feed a_{t-1} to LSTM (one-hot encoded if discrete).
"lstm_use_prev_action": False,
# Whether to feed r_{t-1} to LSTM.
"lstm_use_prev_reward": False,
# Whether the LSTM is time-major (TxBx..) or batch-major (BxTx..).
"_time_major": False,
# == Attention Nets (experimental: torch-version is untested) ==
# Whether to use a GTrXL ("Gru transformer XL"; attention net) as the
# wrapper Model around the default Model.
"use_attention": False,
# The number of transformer units within GTrXL.
# A transformer unit in GTrXL consists of a) MultiHeadAttention module and
# b) a position-wise MLP.
"attention_num_transformer_units": 1,
# The input and output size of each transformer unit.
"attention_dim": 64,
# The number of attention heads within the MultiHeadAttention units.
"attention_num_heads": 1,
# The dim of a single head (within the MultiHeadAttention units).
"attention_head_dim": 32,
# The memory sizes for inference and training.
"attention_memory_inference": 50,
"attention_memory_training": 50,
# The output dim of the position-wise MLP.
"attention_position_wise_mlp_dim": 32,
# The initial bias values for the 2 GRU gates within a transformer unit.
"attention_init_gru_gate_bias": 2.0,
# Whether to feed a_{t-n:t-1} to GTrXL (one-hot encoded if discrete).
"attention_use_n_prev_actions": 0,
# Whether to feed r_{t-n:t-1} to GTrXL.
"attention_use_n_prev_rewards": 0,
# == Atari ==
# Set to True to enable 4x stacking behavior.
"framestack": True,
# Final resized frame dimension
"dim": 84,
# (deprecated) Converts ATARI frame to 1 Channel Grayscale image
"grayscale": False,
# (deprecated) Changes frame to range from [-1, 1] if true
"zero_mean": True,
# === Options for custom models ===
# Name of a custom model to use
"custom_model": None,
# Extra options to pass to the custom classes. These will be available to
# the Model's constructor in the model_config field. Also, they will be
# attempted to be passed as **kwargs to ModelV2 models. For an example,
# see rllib/models/[tf|torch]/attention_net.py.
"custom_model_config": {},
# Name of a custom action distribution to use.
"custom_action_dist": None,
# Custom preprocessors are deprecated. Please use a wrapper class around
# your environment instead to preprocess observations.
"custom_preprocessor": None,
# Deprecated keys:
# Use `lstm_use_prev_action` or `lstm_use_prev_reward` instead.
"lstm_use_prev_action_reward": DEPRECATED_VALUE,
}
In the trainer config these parameters are passed via the "model" sub-dict:
algo_config = {
# All model-related settings go into this sub-dict.
"model": {
# By default, the MODEL_DEFAULTS dict above will be used.
# Change individual keys in that dict by overriding them, e.g.
"fcnet_hiddens": [512, 512, 512],
"fcnet_activation": "relu",
},
# ... other Trainer config keys, e.g. "lr" ...
"lr": 0.00001,
}
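Anything placed under "custom_model_config" is forwarded to your model and shows up in the model_config dict received by its constructor. A small hypothetical sketch (the "hidden_size" key is made up for illustration, and the network body is elided):

import torch.nn as nn
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2

# Trainer-config side (hypothetical "hidden_size" option):
config = {
    "model": {
        "custom_model": "MYmodel",
        "custom_model_config": {"hidden_size": 128},
    },
}

# Model side: read the option back inside __init__.
class MyModelClass(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        hidden_size = model_config["custom_model_config"]["hidden_size"]
        # ... build the network using hidden_size ...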
policy
The model defines the network architecture, but it does not yet define the RL loss, nor the post-processing of trajectories obtained from sample batches, nor things like building an environment model. All of these are defined in the policy. For that reason, the config the policy needs is the same config used by the trainer.
The Policy base class
class ray.rllib.policy.policy.Policy():
def __init__(self, observation_space, action_space, config):
# config is the trainer/policy config
# here the model, optimizer, RL algorithm, etc. are defined
def compute_single_action(obs: Optional[Union[Any, dict, tuple]] = None,
state: Optional[List[Any]] = None, *,
prev_action: Optional[Union[Any, dict, tuple]] = None,
prev_reward: Optional[Union[Any, dict, tuple]] = None,
info: dict = None,
input_dict: Optional[ray.rllib.policy.sample_batch.SampleBatch] = None,
episode: Optional[Episode] = None,
explore: Optional[bool] = None,
timestep: Optional[int] = None,
**kwargs) -> Tuple[Union[Any, dict, tuple], List[Any], Dict[str, Any]]:
# compute the output (action) for a single observation
def compute_actions_from_input_dict(
self,
input_dict: Union[SampleBatch, Dict[str, TensorStructType]],
explore: bool = None,
timestep: Optional[int] = None,
episodes: Optional[List["Episode"]] = None,
**kwargs,
) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
# compute actions from an input_dict
# NOTE: both methods above are redirected to compute_actions() below!
def compute_actions(
self,
obs_batch: Union[List[TensorStructType], TensorStructType],
state_batches: Optional[List[TensorType]] = None,
prev_action_batch: Union[List[TensorStructType], TensorStructType] = None,
prev_reward_batch: Union[List[TensorStructType], TensorStructType] = None,
info_batch: Optional[Dict[str, list]] = None,
episodes: Optional[List["Episode"]] = None,
explore: Optional[bool] = None,
timestep: Optional[int] = None,
**kwargs,
) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
# this is the part that needs to be implemented
def compute_log_likelihoods(
self,
actions: Union[List[TensorType], TensorType],
obs_batch: Union[List[TensorType], TensorType],
state_batches: Optional[List[TensorType]] = None,
prev_action_batch: Optional[Union[List[TensorType], TensorType]] = None,
prev_reward_batch: Optional[Union[List[TensorType], TensorType]] = None,
actions_normalized: bool = True,
) -> TensorType:
# computes log pi(a|s)
def postprocess_trajectory(
self,
sample_batch: SampleBatch,
other_agent_batches: Optional[
Dict[AgentID, Tuple["Policy", SampleBatch]]
] = None,
episode: Optional["Episode"] = None,
) -> SampleBatch:
# post-process the trajectory
def loss(
self, model: ModelV2, dist_class: ActionDistribution, train_batch: SampleBatch
) -> Union[TensorType, List[TensorType]]:
# defines the RL loss
# compute gradients
def compute_gradients(
self, postprocessed_batch: SampleBatch
) -> Tuple[ModelGradients, Dict[str, TensorType]]:
# apply gradients
def apply_gradients(self, gradients: ModelGradients) -> None:
# once compute_gradients and apply_gradients above are implemented, this can be used directly
def learn_on_batch(self, samples: SampleBatch) -> Dict[str, TensorType]:
# get weights
def get_weights(self) -> ModelWeights:
# set weights
def set_weights(self, weights: ModelWeights) -> None:
# get state
def get_state(self) -> PolicyState:
# set state
def set_state(self, state: PolicyState) -> None:
def export_model(self, export_dir: str, onnx: Optional[int] = None) -> None:
def import_model_from_h5(self, import_file: str) -> None:
# related to experience replay
def load_batch_into_buffer(self, batch: SampleBatch, buffer_index: int = 0) -> int:
# store experience in the replay buffer (experience replay pool)
# learn from the replay buffer
def learn_on_batch_from_replay_buffer(
self, replay_actor: ActorHandle, policy_id: PolicyID
) -> Dict[str, TensorType]:
def learn_on_loaded_batch(self, offset: int = 0, buffer_index: int = 0):
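As a reference point, here is a minimal hand-written policy, sketched loosely after the random-policy example in the RLlib docs (uniform random actions, no learning; the class name is just an illustration):

from ray.rllib.policy.policy import Policy

class MyRandomPolicy(Policy):
    def __init__(self, observation_space, action_space, config):
        super().__init__(observation_space, action_space, config)

    def compute_actions(self, obs_batch, state_batches=None,
                        prev_action_batch=None, prev_reward_batch=None,
                        info_batch=None, episodes=None, **kwargs):
        # must return (actions, state_outs, extra_fetches)
        return [self.action_space.sample() for _ in obs_batch], [], {}

    def learn_on_batch(self, samples):
        return {}  # no learning; return a stats dict

    def get_weights(self):
        return {}

    def set_weights(self, weights):
        pass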
TorchPolicy
A TorchPolicy is created by filling in the functions of the framework above. So we can either override only the functions that need to change and let build_policy_class / build_torch_policy assemble the policy for us, or we can write the policy class directly.
Building the policy from individual functions has the advantage that each function can be swapped out repeatedly as needed, which is convenient.
Building a policy from functions
from ray.rllib.policy.torch_policy_template import build_torch_policy

MyTorchPolicy = build_torch_policy(          # this creates a Policy class
    name="MyTorchPolicy",                    # the policy's name
    get_default_config=lambda: config,       # a callable returning your own default config dict
    loss_fn=actor_critic_loss,               # same role as loss() above, written as a free function
                                             # with a `policy` argument instead of `self`
    stats_fn=loss_and_entropy_stats,         # stats you want reported
    postprocess_fn=add_advantages,           # trajectory post-processing
    extra_action_out_fn=model_value_predictions,
    extra_grad_process_fn=apply_grad_clipping,
    optimizer_fn=torch_optimizer,
    mixins=[ValueNetworkMixin])

def torch_optimizer(policy, config):
    # return a torch optimizer over the policy's model parameters
    return torch.optim.Adam(policy.model.parameters(), lr=config["lr"])
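The functions passed in follow fixed signatures. As a purely illustrative sketch (this is not an algorithm-accurate actor-critic loss), actor_critic_loss and loss_and_entropy_stats could look like:

def actor_critic_loss(policy, model, dist_class, train_batch):
    # loss_fn signature: (policy, model, dist_class, train_batch) -> loss tensor
    logits, _ = model(train_batch)
    action_dist = dist_class(logits, model)
    log_probs = action_dist.logp(train_batch["actions"])
    # illustrative only: a plain policy-gradient loss on raw rewards
    policy.pi_err = -(log_probs * train_batch["rewards"]).mean()
    return policy.pi_err

def loss_and_entropy_stats(policy, train_batch):
    # stats_fn signature: (policy, train_batch) -> dict of scalars to report
    return {"policy_loss": policy.pi_err}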
Training with the custom policy
import os

import ray
from ray import tune
from ray.rllib.algorithms.algorithm import Algorithm

# Create a new Algorithm using the Policy defined above.
class MyAlgorithm(Algorithm):
    def get_default_policy_class(self, config):
        return MyTorchPolicy

tune.run(
    MyAlgorithm,
    stop={"training_iteration": args.stop_iters},  # e.g. parsed from argparse
    config={
        "env": "CartPole-v0",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_workers": 2,
        "framework": "torch",
    },
)
Building the policy class directly
Building from an existing policy:
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
CustomPolicy = PPOTFPolicy.with_updates(
name="MyCustomPPOTFPolicy",
loss_fn=some_custom_loss_fn)
CustomTrainer = PPOTrainer.with_updates(
default_policy=CustomPolicy)
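Here some_custom_loss_fn must follow the same loss_fn signature as above. A purely illustrative sketch (not the real PPO loss):

import tensorflow as tf

def some_custom_loss_fn(policy, model, dist_class, train_batch):
    # signature: (policy, model, dist_class, train_batch) -> loss tensor
    logits, _ = model(train_batch)
    action_dist = dist_class(logits, model)
    # illustrative only: log-likelihood weighted by the advantages computed in postprocessing
    return -tf.reduce_mean(
        action_dist.logp(train_batch["actions"]) * train_batch["advantages"])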
Next steps
Follow-up posts will implement hand-written policies for the common algorithms.
https://docs.ray.io/en/latest/rllib/package_ref/policy/custom_policies.html
https://zhuanlan.zhihu.com/p/80204897
The relationship between trainer and policy
The trainer holds all the policies needed to interact with the environment (single-policy or multi-policy) and creates workers (a worker wraps an env and is used to generate samples). The trainer's structure will be described in a later post.