一、Running and Testing
- What we run here is isaacgymenvs, the official Isaac Gym reinforcement-learning environment code
Download link: https://zhuanlan.zhihu.com/p/671309384
1、Run commands and results
- Training command
python train.py task=Cartpole #headless=True
This runs the Cartpole (inverted-pendulum) task; it converges after training for a short while. Setting headless=True disables graphical output.
- Test command
python train.py task=Cartpole test=True checkpoint=./runs/Cartpole-21-22-15/nn/Cartpole.pth num_envs=2
test=True switches to test mode, num_envs sets the number of parallel environments, and checkpoint is the path to the trained model.
二、train.py Walkthrough
1、The hydra module
@hydra.main(version_base="1.1", config_name="config", config_path="./cfg")
- This piece is implemented as a decorator and loads the configuration from YAML files. It not only reads ./cfg/config.yaml but also picks up options passed on the command line, and the command-line options override the values from the YAML file.
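- For reference, a minimal sketch of this entry-point pattern (simplified; the decorated function in the real train.py is called launch_rlg_hydra and does much more):
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(version_base="1.1", config_name="config", config_path="./cfg")
def launch_rlg_hydra(cfg: DictConfig):
    # cfg is ./cfg/config.yaml merged with command-line overrides such as task=Cartpole
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    launch_rlg_hydra()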
2、Basic configuration dictionary
# ensure checkpoints can be specified as relative paths
if cfg.checkpoint:
cfg.checkpoint = to_absolute_path(cfg.checkpoint)
cfg_dict = omegaconf_to_dict(cfg)
print_dict("配置字典:",cfg_dict)
# set numpy formatting for printing only 配置numpy打印时候的格式
set_np_formatting()
# global rank of the GPU 获取进程rank号
global_rank = int(os.getenv("RANK", "0"))
# sets seed. if seed is -1 will pick a random one 设置种子
cfg.seed = set_seed(cfg.seed, torch_deterministic=cfg.torch_deterministic, rank=global_rank)
- This block just makes small adjustments on top of cfg: it resolves the checkpoint to an absolute path, converts the OmegaConf config into a plain dict, sets the numpy print format, reads the process rank, and seeds the run.
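- As an aside, omegaconf_to_dict (imported from isaacgymenvs.utils.reformat) does roughly the following; a minimal sketch, not the library code verbatim:
from omegaconf import DictConfig

def omegaconf_to_dict(d: DictConfig) -> dict:
    # recursively convert an OmegaConf DictConfig into a plain Python dict
    out = {}
    for k, v in d.items():
        out[k] = omegaconf_to_dict(v) if isinstance(v, DictConfig) else v
    return out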
3、Registering the environment
def create_isaacgym_env(**kwargs): # create the Isaac Gym environment
envs = isaacgymenvs.make(
cfg.seed,
cfg.task_name,
cfg.task.env.numEnvs,
cfg.sim_device,
cfg.rl_device,
cfg.graphics_device_id,
cfg.headless,
cfg.multi_gpu,
cfg.capture_video,
cfg.force_render,
cfg,
**kwargs,
)
if cfg.capture_video:
envs.is_vector_env = True
envs = gym.wrappers.RecordVideo(
envs,
f"videos/{run_name}",
step_trigger=lambda step: step % cfg.capture_video_freq == 0,
video_length=cfg.capture_video_len,
)
return envs
# This registers an environment named 'rlgpu'; the same name is later set as env_name in CartpolePPO.yaml
env_configurations.register('rlgpu', {
'vecenv_type': 'RLGPU1', # names the vectorized-env type; must match the 'RLGPU1' registered below
'env_creator': lambda **kwargs: create_isaacgym_env(**kwargs),
})
# Next, register the vectorized-environment type 'RLGPU1'; this is the class that manages the batch of parallel environments
# look up the task's environment class in the task map built in __init__.py
ige_env_cls = isaacgym_task_map[cfg.task_name] # this yields the environment class for the task
dict_cls = ige_env_cls.dict_obs_cls if hasattr(ige_env_cls, 'dict_obs_cls') and ige_env_cls.dict_obs_cls else False # conditional expression choosing dict-observation support
if dict_cls:
obs_spec = {}
actor_net_cfg = cfg.train.params.network
obs_spec['obs'] = {'names': list(actor_net_cfg.inputs.keys()), 'concat': not actor_net_cfg.name == "complex_net", 'space_name': 'observation_space'}
if "central_value_config" in cfg.train.params.config:
critic_net_cfg = cfg.train.params.config.central_value_config.network
obs_spec['states'] = {'names': list(critic_net_cfg.inputs.keys()), 'concat': not critic_net_cfg.name == "complex_net", 'space_name': 'state_space'}
vecenv.register('RLGPU1', lambda config_name, num_actors, **kwargs: ComplexObsRLGPUEnv(config_name, num_actors, obs_spec, **kwargs))
else:
vecenv.register('RLGPU1', lambda config_name, num_actors, **kwargs: RLGPUEnv(config_name, num_actors, **kwargs))
- This block configures 'rlgpu': it defines how the parallel environments are managed (the vecenv type) and which function creates them. The name 'rlgpu' is the env_name specified in the RL task's training YAML.
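- To make the link explicit: the training YAML is what points rl_games at this registration. An illustrative excerpt of what cfg/train/CartpolePPO.yaml contributes to the config dict (only env_name matters here; the other value is a placeholder):
train_cfg_excerpt = {
    'params': {
        'config': {
            'env_name': 'rlgpu',  # matches env_configurations.register('rlgpu', ...) above
            'num_actors': 512,    # placeholder: number of parallel envs handed to the RLGPU1 factory
        }
    }
}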
4、Extending the config dictionary
rlg_config_dict = omegaconf_to_dict(cfg.train) # extract the training-related parameters
print(rlg_config_dict)
rlg_config_dict = preprocess_train_config(cfg, rlg_config_dict)
print(rlg_config_dict)
def preprocess_train_config(cfg, config_dict):
"""
Adding common configuration parameters to the rl_games train config (the four fields assigned below).
An alternative to this is inferring them in task-specific .yaml files, but that requires repeating the same
variable interpolations in each config.
"""
train_cfg = config_dict['params']['config']
train_cfg['device'] = cfg.rl_device
train_cfg['population_based_training'] = cfg.pbt.enabled
train_cfg['pbt_idx'] = cfg.pbt.policy_idx if cfg.pbt.enabled else None
train_cfg['full_experiment_name'] = cfg.get('full_experiment_name')
# print(f'Using rl_device: {cfg.rl_device}')
# print(f'Using sim_device: {cfg.sim_device}')
# print(train_cfg)
try: # check whether the MLP layer sizes need to be scaled
model_size_multiplier = config_dict['params']['network']['mlp']['model_size_multiplier']
if model_size_multiplier != 1:
units = config_dict['params']['network']['mlp']['units']
for i, u in enumerate(units):
units[i] = u * model_size_multiplier
print(f'Modified MLP units by x{model_size_multiplier} to {config_dict["params"]["network"]["mlp"]["units"]}')
except KeyError:
pass
return config_dict
- As the printed logs confirm, the keys that get added are exactly the ones assigned in this helper function; it exists for convenience, so each task YAML does not have to repeat them.
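- A quick way to confirm which keys were added (a hedged snippet; the printed values depend on your run configuration):
added_keys = ('device', 'population_based_training', 'pbt_idx', 'full_experiment_name')
print({k: rlg_config_dict['params']['config'][k] for k in added_keys})
# e.g. {'device': 'cuda:0', 'population_based_training': False, 'pbt_idx': None, 'full_experiment_name': None}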
5、Setting up the observers
observers = [RLGPUAlgoObserver()]
if cfg.pbt.enabled:
pbt_observer = PbtAlgoObserver(cfg)
observers.append(pbt_observer)
if cfg.wandb_activate:
cfg.seed += global_rank
if global_rank == 0:
# initialize wandb only once per multi-gpu run
wandb_observer = WandbAlgoObserver(cfg)
observers.append(wandb_observer)
# register new AMP network builder and agent
def build_runner(algo_observer):
runner = Runner(algo_observer)
# the AMP registrations below are not used for Cartpole; they are there for other (AMP-based) tasks
runner.algo_factory.register_builder('amp_continuous', lambda **kwargs : amp_continuous.AMPAgent(**kwargs))
runner.player_factory.register_builder('amp_continuous', lambda **kwargs : amp_players.AMPPlayerContinuous(**kwargs))
model_builder.register_model('continuous_amp', lambda network, **kwargs : amp_models.ModelAMPContinuous(network))
model_builder.register_network('amp', lambda **kwargs : amp_network_builder.AMPBuilder())
return runner
- Up to three observers are added here (RLGPUAlgoObserver, plus the PBT and wandb observers when enabled); they are bundled up for the runner to use later.
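- If you need your own hooks, these observers all derive from rl_games.common.algo_observer.AlgoObserver; a minimal sketch of a custom one (the class name and behaviour here are made up for illustration):
from rl_games.common.algo_observer import AlgoObserver

class PrintRewardObserver(AlgoObserver):  # hypothetical observer, for illustration only
    def after_init(self, algo):
        self.algo = algo  # keep a handle to the agent
    def after_print_stats(self, frame, epoch_num, total_time):
        # game_rewards is the running-average buffer the agent maintains (see section 五)
        if self.algo.game_rewards.current_size > 0:
            print(f'epoch {epoch_num}: mean reward {self.algo.game_rewards.get_mean()}')
- Appending an instance of such a class to the observers list above is all that is needed for MultiObserver to call it.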
6、Setting up the runner
runner = build_runner(MultiObserver(observers))
runner.load(rlg_config_dict) # load the full training config
runner.reset() # reset the runner
# dump config dict: this becomes the config.yaml saved under each run's log directory
if not cfg.test:
experiment_dir = os.path.join('runs', cfg.train.params.config.name +
'_{date:%d-%H-%M-%S}'.format(date=datetime.now()))
os.makedirs(experiment_dir, exist_ok=True)
with open(os.path.join(experiment_dir, 'config.yaml'), 'w') as f:
f.write(OmegaConf.to_yaml(cfg))
print("进入循环")
print(cfg)
runner.run({
'train': not cfg.test,
'play': cfg.test,
'checkpoint': cfg.checkpoint,
'sigma': cfg.sigma if cfg.sigma != '' else None
})
print("出循环")
三、torch_runner.py解读
- Runner是一个完整的类,属于rl_games模块
1、初始化
- 这个algo_factory主要就是维护了一个字典,将不同的算法类存到其中
def __init__(self, algo_observer=None):
self.algo_factory = object_factory.ObjectFactory()
self.algo_factory.register_builder('a2c_continuous', lambda **kwargs : a2c_continuous.A2CAgent(**kwargs))
self.algo_factory.register_builder('a2c_discrete', lambda **kwargs : a2c_discrete.DiscreteA2CAgent(**kwargs))
self.algo_factory.register_builder('sac', lambda **kwargs: sac_agent.SACAgent(**kwargs))
#self.algo_factory.register_builder('dqn', lambda **kwargs : dqnagent.DQNAgent(**kwargs))
self.player_factory = object_factory.ObjectFactory()
self.player_factory.register_builder('a2c_continuous', lambda **kwargs : players.PpoPlayerContinuous(**kwargs))
self.player_factory.register_builder('a2c_discrete', lambda **kwargs : players.PpoPlayerDiscrete(**kwargs))
self.player_factory.register_builder('sac', lambda **kwargs : players.SACPlayer(**kwargs))
#self.player_factory.register_builder('dqn', lambda **kwargs : players.DQNPlayer(**kwargs))
self.algo_observer = algo_observer if algo_observer else DefaultAlgoObserver()
torch.backends.cudnn.benchmark = True
### it didnot help for lots for openai gym envs anyway :(
#torch.backends.cudnn.deterministic = True
#torch.use_deterministic_algorithms(True)
2、run(self, args)
- This function simply switches between training mode and test (play) mode based on the arguments
def run(self, args):
if args['train']:
self.run_train(args)
elif args['play']:
self.run_play(args)
else:
self.run_train(args)
3、run_train(self, args)
- Training mode creates an agent of the configured algorithm type
- and then calls its train() method
def run_train(self, args):
print('Started to train')
print(self.algo_name)
agent = self.algo_factory.create(self.algo_name, base_name='run', params=self.params)
_restore(agent, args)
_override_sigma(agent, args)
agent.train()
四、a2c_continuous.py Walkthrough
- This file contains the complete class A2CAgent(a2c_common.ContinuousA2CBase)
- It is derived from a2c_common.ContinuousA2CBase and overrides several of the base class's "virtual" methods, including calc_gradients (gradient computation) and reg_loss (regularisation). These methods are called from the base class's train() function.
- So the train() invoked in the previous section is a base-class method; see the schematic sketch below.
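- Schematically, the split of responsibilities looks like this (a sketch, not the real class; the actual calc_gradients override is listed right after):
from rl_games.common import a2c_common

class A2CAgentSketch(a2c_common.ContinuousA2CBase):  # the real class is A2CAgent
    def calc_gradients(self, input_dict):
        # invoked by the base class: train() -> train_epoch() -> train_actor_critic()
        ...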
def calc_gradients(self, input_dict):
value_preds_batch = input_dict['old_values']
old_action_log_probs_batch = input_dict['old_logp_actions']
advantage = input_dict['advantages']
old_mu_batch = input_dict['mu']
old_sigma_batch = input_dict['sigma']
return_batch = input_dict['returns']
actions_batch = input_dict['actions']
obs_batch = input_dict['obs']
obs_batch = self._preproc_obs(obs_batch)
lr_mul = 1.0
curr_e_clip = self.e_clip
batch_dict = {
'is_train': True,
'prev_actions': actions_batch,
'obs' : obs_batch,
}
rnn_masks = None
if self.is_rnn:
rnn_masks = input_dict['rnn_masks']
batch_dict['rnn_states'] = input_dict['rnn_states']
batch_dict['seq_length'] = self.seq_length
if self.zero_rnn_on_done:
batch_dict['dones'] = input_dict['dones']
with torch.cuda.amp.autocast(enabled=self.mixed_precision):
# 1. forward the mini-batch through the model and unpack its outputs
res_dict = self.model(batch_dict)
action_log_probs = res_dict['prev_neglogp']
values = res_dict['values']
entropy = res_dict['entropy']
mu = res_dict['mus']
sigma = res_dict['sigmas']
# 2. compute the losses: a_loss is the actor loss, c_loss the critic loss, entropy the policy entropy, b_loss the bound loss
a_loss = self.actor_loss_func(old_action_log_probs_batch, action_log_probs, advantage, self.ppo, curr_e_clip)
if self.has_value_loss:
c_loss = common_losses.critic_loss(self.model,value_preds_batch, values, curr_e_clip, return_batch, self.clip_value)
else:
c_loss = torch.zeros(1, device=self.ppo_device)
if self.bound_loss_type == 'regularisation':
b_loss = self.reg_loss(mu)
elif self.bound_loss_type == 'bound':
b_loss = self.bound_loss(mu)
else:
b_loss = torch.zeros(1, device=self.ppo_device)
losses, sum_mask = torch_ext.apply_masks([a_loss.unsqueeze(1), c_loss , entropy.unsqueeze(1), b_loss.unsqueeze(1)], rnn_masks)
a_loss, c_loss, entropy, b_loss = losses[0], losses[1], losses[2], losses[3]
loss = a_loss + 0.5 * c_loss * self.critic_coef - entropy * self.entropy_coef + b_loss * self.bounds_loss_coef
if self.multi_gpu:
self.optimizer.zero_grad()
else:
for param in self.model.parameters():
param.grad = None
self.scaler.scale(loss).backward()
#TODO: Refactor this ugliest code of they year
self.trancate_gradients_and_step()
with torch.no_grad():
reduce_kl = rnn_masks is None
kl_dist = torch_ext.policy_kl(mu.detach(), sigma.detach(), old_mu_batch, old_sigma_batch, reduce_kl)
if rnn_masks is not None:
kl_dist = (kl_dist * rnn_masks).sum() / rnn_masks.numel() #/ sum_mask
self.diagnostics.mini_batch(self,
{
'values' : value_preds_batch,
'returns' : return_batch,
'new_neglogp' : action_log_probs,
'old_neglogp' : old_action_log_probs_batch,
'masks' : rnn_masks
}, curr_e_clip, 0)
self.train_result = (a_loss, c_loss, entropy, \
kl_dist, self.last_lr, lr_mul, \
mu.detach(), sigma.detach(), b_loss)
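- For reference, actor_loss_func above implements the standard clipped PPO surrogate; a hedged sketch of what it computes from the stored negative log-probabilities:
import torch

def clipped_ppo_actor_loss(old_neglogp, neglogp, advantage, e_clip):
    # ratio = pi_new(a|s) / pi_old(a|s), reconstructed from negative log-probabilities
    ratio = torch.exp(old_neglogp - neglogp)
    surr1 = advantage * ratio
    surr2 = advantage * torch.clamp(ratio, 1.0 - e_clip, 1.0 + e_clip)
    # per-sample loss; apply_masks above averages it (with RNN masks if present)
    return torch.max(-surr1, -surr2)
- The total loss then mixes it with the critic, entropy, and bound terms exactly as in the listing: loss = a_loss + 0.5 * c_loss * critic_coef - entropy * entropy_coef + b_loss * bounds_loss_coef.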
五、a2c_common.py Walkthrough
- This file contains the A2CBase base class and the continuous and discrete A2C classes derived from it
1、train()
- The body of train() in ContinuousA2CBase(A2CBase) is a loop that calls self.train_epoch() to run the simulation, collect data, and train the model
def train(self):
self.init_tensors()
self.last_mean_rewards = -100500
start_time = time.time()
total_time = 0
rep_count = 0
self.obs = self.env_reset()
self.curr_frames = self.batch_size_envs
if self.multi_gpu:
print("====================broadcasting parameters")
model_params = [self.model.state_dict()]
dist.broadcast_object_list(model_params, 0)
self.model.load_state_dict(model_params[0])
while True:
epoch_num = self.update_epoch()
step_time, play_time, update_time, sum_time, a_losses, c_losses, b_losses, entropies, kls, last_lr, lr_mul = self.train_epoch() ## this call does all the real work (rollout + update)
total_time += sum_time
frame = self.frame // self.num_agents
# cleaning memory to optimize space
self.dataset.update_values_dict(None)
should_exit = False
if self.global_rank == 0:
self.diagnostics.epoch(self, current_epoch = epoch_num)
# do we need scaled_time?
scaled_time = self.num_agents * sum_time
scaled_play_time = self.num_agents * play_time
curr_frames = self.curr_frames * self.world_size if self.multi_gpu else self.curr_frames
self.frame += curr_frames
print_statistics(self.print_stats, curr_frames, step_time, scaled_play_time, scaled_time,
epoch_num, self.max_epochs, frame, self.max_frames)
self.write_stats(total_time, epoch_num, step_time, play_time, update_time,
a_losses, c_losses, entropies, kls, last_lr, lr_mul, frame,
scaled_time, scaled_play_time, curr_frames)
if len(b_losses) > 0:
self.writer.add_scalar('losses/bounds_loss', torch_ext.mean_list(b_losses).item(), frame)
if self.has_soft_aug:
self.writer.add_scalar('losses/aug_loss', np.mean(aug_losses), frame)
if self.game_rewards.current_size > 0:
mean_rewards = self.game_rewards.get_mean()
mean_shaped_rewards = self.game_shaped_rewards.get_mean()
mean_lengths = self.game_lengths.get_mean()
self.mean_rewards = mean_rewards[0]
for i in range(self.value_size):
rewards_name = 'rewards' if i == 0 else 'rewards{0}'.format(i)
self.writer.add_scalar(rewards_name + '/step'.format(i), mean_rewards[i], frame)
self.writer.add_scalar(rewards_name + '/iter'.format(i), mean_rewards[i], epoch_num)
self.writer.add_scalar(rewards_name + '/time'.format(i), mean_rewards[i], total_time)
self.writer.add_scalar('shaped_' + rewards_name + '/step'.format(i), mean_shaped_rewards[i], frame)
self.writer.add_scalar('shaped_' + rewards_name + '/iter'.format(i), mean_shaped_rewards[i], epoch_num)
self.writer.add_scalar('shaped_' + rewards_name + '/time'.format(i), mean_shaped_rewards[i], total_time)
self.writer.add_scalar('episode_lengths/step', mean_lengths, frame)
self.writer.add_scalar('episode_lengths/iter', mean_lengths, epoch_num)
self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time)
if self.has_self_play_config:
self.self_play_manager.update(self)
checkpoint_name = self.config['name'] + '_ep_' + str(epoch_num) + '_rew_' + str(mean_rewards[0])
if self.save_freq > 0:
if epoch_num % self.save_freq == 0:
self.save(os.path.join(self.nn_dir, 'last_' + checkpoint_name))
if mean_rewards[0] > self.last_mean_rewards and epoch_num >= self.save_best_after:
print('saving next best rewards: ', mean_rewards)
self.last_mean_rewards = mean_rewards[0]
self.save(os.path.join(self.nn_dir, self.config['name']))
if 'score_to_win' in self.config:
if self.last_mean_rewards > self.config['score_to_win']:
print('Maximum reward achieved. Network won!')
self.save(os.path.join(self.nn_dir, checkpoint_name))
should_exit = True
if epoch_num >= self.max_epochs and self.max_epochs != -1:
if self.game_rewards.current_size == 0:
print('WARNING: Max epochs reached before any env terminated at least once')
mean_rewards = -np.inf
self.save(os.path.join(self.nn_dir, 'last_' + self.config['name'] + '_ep_' + str(epoch_num) \
+ '_rew_' + str(mean_rewards).replace('[', '_').replace(']', '_')))
print('MAX EPOCHS NUM!')
should_exit = True
if self.frame >= self.max_frames and self.max_frames != -1:
if self.game_rewards.current_size == 0:
print('WARNING: Max frames reached before any env terminated at least once')
mean_rewards = -np.inf
self.save(os.path.join(self.nn_dir, 'last_' + self.config['name'] + '_frame_' + str(self.frame) \
+ '_rew_' + str(mean_rewards).replace('[', '_').replace(']', '_')))
print('MAX FRAMES NUM!')
should_exit = True
update_time = 0
if self.multi_gpu:
should_exit_t = torch.tensor(should_exit, device=self.device).float()
dist.broadcast(should_exit_t, 0)
should_exit = should_exit_t.float().item()
if should_exit:
return self.last_mean_rewards, epoch_num
if should_exit:
return self.last_mean_rewards, epoch_num
2、self.train_epoch()
- This function runs the rollout and then performs the gradient updates
- self.train_actor_critic(self.dataset[i]) performs a gradient step on one mini-batch
- batch_dict = self.play_steps() collects the rollout data from the simulator
def train_epoch(self):
super().train_epoch()
self.set_eval()
play_time_start = time.time()
with torch.no_grad():
if self.is_rnn:
batch_dict = self.play_steps_rnn()
else:
batch_dict = self.play_steps()
self.set_train()
play_time_end = time.time()
update_time_start = time.time()
rnn_masks = batch_dict.get('rnn_masks', None)
self.curr_frames = batch_dict.pop('played_frames')
self.prepare_dataset(batch_dict)
self.algo_observer.after_steps()
a_losses = []
c_losses = []
entropies = []
kls = []
if self.has_central_value:
self.train_central_value()
for mini_ep in range(0, self.mini_epochs_num):
ep_kls = []
for i in range(len(self.dataset)):
a_loss, c_loss, entropy, kl, last_lr, lr_mul = self.train_actor_critic(self.dataset[i])
a_losses.append(a_loss)
c_losses.append(c_loss)
ep_kls.append(kl)
entropies.append(entropy)
av_kls = torch_ext.mean_list(ep_kls)
if self.multi_gpu:
dist.all_reduce(av_kls, op=dist.ReduceOp.SUM)
av_kls /= self.world_size
self.last_lr, self.entropy_coef = self.scheduler.update(self.last_lr, self.entropy_coef, self.epoch_num, 0, av_kls.item())
self.update_lr(self.last_lr)
kls.append(av_kls)
self.diagnostics.mini_epoch(self, mini_ep)
if self.normalize_input:
self.model.running_mean_std.eval() # don't need to update statstics more than one miniepoch
update_time_end = time.time()
play_time = play_time_end - play_time_start
update_time = update_time_end - update_time_start
total_time = update_time_end - play_time_start
return batch_dict['step_time'], play_time, update_time, total_time, a_losses, c_losses, entropies, kls, last_lr, lr_mul
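- The scheduler.update call at the end of each mini-epoch is, with the usual 'adaptive' schedule, a KL-based rule; a hedged sketch of its logic (the threshold and LR bounds shown are common defaults, not values read from this config):
class AdaptiveLRSketch:
    def __init__(self, kl_threshold=0.008, min_lr=1e-6, max_lr=1e-2):
        self.kl_threshold = kl_threshold
        self.min_lr, self.max_lr = min_lr, max_lr

    def update(self, current_lr, entropy_coef, epoch, frames, kl_dist, **kwargs):
        lr = current_lr
        if kl_dist > 2.0 * self.kl_threshold:    # policy moved too far: shrink the learning rate
            lr = max(current_lr / 1.5, self.min_lr)
        if kl_dist < 0.5 * self.kl_threshold:    # policy barely moved: grow the learning rate
            lr = min(current_lr * 1.5, self.max_lr)
        return lr, entropy_coef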
3、play_steps(self)
- This function loops over the horizon: it computes actions, steps the environments, and records observations and rewards
- Compute actions: res_dict = self.get_action_values(self.obs)
- Step the simulation: self.obs, rewards, self.dones, infos = self.env_step(res_dict['actions'])
def play_steps(self):
update_list = self.update_list
step_time = 0.0
for n in range(self.horizon_length):
if self.use_action_masks:
masks = self.vec_env.get_action_masks()
res_dict = self.get_masked_action_values(self.obs, masks)
else:
res_dict = self.get_action_values(self.obs)
self.experience_buffer.update_data('obses', n, self.obs['obs'])
self.experience_buffer.update_data('dones', n, self.dones)
for k in update_list:
self.experience_buffer.update_data(k, n, res_dict[k])
if self.has_central_value:
self.experience_buffer.update_data('states', n, self.obs['states'])
step_time_start = time.time()
self.obs, rewards, self.dones, infos = self.env_step(res_dict['actions']) # step the simulation
step_time_end = time.time()
step_time += (step_time_end - step_time_start)
shaped_rewards = self.rewards_shaper(rewards)
if self.value_bootstrap and 'time_outs' in infos:
shaped_rewards += self.gamma * res_dict['values'] * self.cast_obs(infos['time_outs']).unsqueeze(1).float()
self.experience_buffer.update_data('rewards', n, shaped_rewards)
self.current_rewards += rewards
self.current_shaped_rewards += shaped_rewards
self.current_lengths += 1
all_done_indices = self.dones.nonzero(as_tuple=False)
env_done_indices = all_done_indices[::self.num_agents]
self.game_rewards.update(self.current_rewards[env_done_indices])
self.game_shaped_rewards.update(self.current_shaped_rewards[env_done_indices])
self.game_lengths.update(self.current_lengths[env_done_indices])
self.algo_observer.process_infos(infos, env_done_indices)
not_dones = 1.0 - self.dones.float()
self.current_rewards = self.current_rewards * not_dones.unsqueeze(1)
self.current_shaped_rewards = self.current_shaped_rewards * not_dones.unsqueeze(1)
self.current_lengths = self.current_lengths * not_dones
last_values = self.get_values(self.obs)
fdones = self.dones.float()
mb_fdones = self.experience_buffer.tensor_dict['dones'].float()
mb_values = self.experience_buffer.tensor_dict['values']
mb_rewards = self.experience_buffer.tensor_dict['rewards']
mb_advs = self.discount_values(fdones, last_values, mb_fdones, mb_values, mb_rewards)
mb_returns = mb_advs + mb_values
batch_dict = self.experience_buffer.get_transformed_list(swap_and_flatten01, self.tensor_list)
batch_dict['returns'] = swap_and_flatten01(mb_returns)
batch_dict['played_frames'] = self.batch_size
batch_dict['step_time'] = step_time
return batch_dict
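- discount_values, called just above, computes standard GAE(λ) advantages over the horizon, bootstrapping from last_values; a hedged sketch of that computation (gamma and tau come from the training config):
import torch

def gae_advantages(fdones, last_values, mb_fdones, mb_values, mb_rewards, gamma, tau):
    horizon = mb_rewards.shape[0]
    advs = torch.zeros_like(mb_rewards)
    lastgaelam = 0
    for t in reversed(range(horizon)):
        if t == horizon - 1:
            nextnonterminal = 1.0 - fdones           # dones observed after the final collected step
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - mb_fdones[t + 1]
            nextvalues = mb_values[t + 1]
        nextnonterminal = nextnonterminal.unsqueeze(1)
        delta = mb_rewards[t] + gamma * nextvalues * nextnonterminal - mb_values[t]
        advs[t] = lastgaelam = delta + gamma * tau * nextnonterminal * lastgaelam
    return advs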
4、env_step(self, actions)
- The key call is self.vec_env.step(actions), which advances the simulation
def env_step(self, actions):
actions = self.preprocess_actions(actions) # preprocess the actions
obs, rewards, dones, infos = self.vec_env.step(actions) # step the environments
if self.is_tensor_obses:
if self.value_size == 1:
rewards = rewards.unsqueeze(1)
return self.obs_to_tensors(obs), rewards.to(self.ppo_device), dones.to(self.ppo_device), infos
else:
if self.value_size == 1:
rewards = np.expand_dims(rewards, axis=1)
return self.obs_to_tensors(obs), torch.from_numpy(rewards).to(self.ppo_device).float(), torch.from_numpy(dones).to(self.ppo_device), infos
六、cartpole.py / vec_task.py Walkthrough
- vec_env is created by the create_vec_env function. As the code below shows, the environment factory is looked up through the vecenv_config dictionary; that dictionary was populated back in train.py (the registration code in the first file), where the configuration selected the Cartpole task.
vecenv_config = {}
def register(config_name, func):
vecenv_config[config_name] = func
def create_vec_env(config_name, num_actors, **kwargs):
vec_env_name = configurations[config_name]['vecenv_type']
return vecenv_config[vec_env_name](config_name, num_actors, **kwargs)
- vec_env.step() is a method of the task class Cartpole(VecTask); it is inherited from the base class VecTask
- self.pre_physics_step(action_tensor) is overridden in Cartpole and applies the actions
- self.render() draws the viewer; it is also implemented in this class and calls step_graphics and draw_viewer
- self.gym.simulate(self.sim) advances the physics by one step
- self.post_physics_step() computes and stores the environment's new state; a sketch of the two Cartpole hooks follows the listing below
def step(self, actions: torch.Tensor) -> Tuple[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor, Dict[str, Any]]:
"""Step the physics of the environment.
Args:
actions: actions to apply
Returns:
Observations, rewards, resets, info
Observations are dict of observations (currently only one member called 'obs')
"""
# randomize actions
if self.dr_randomizations.get('actions', None):
actions = self.dr_randomizations['actions']['noise_lambda'](actions)
action_tensor = torch.clamp(actions, -self.clip_actions, self.clip_actions)
# apply actions
self.pre_physics_step(action_tensor)
# step physics and render each frame
for i in range(self.control_freq_inv):
if self.force_render:
self.render()
self.gym.simulate(self.sim) # advance the physics by one step
# to fix!
if self.device == 'cpu':
self.gym.fetch_results(self.sim, True)
# compute observations, rewards, resets, ...
self.post_physics_step()
self.control_steps += 1
# fill time out buffer: set to 1 if we reached the max episode length AND the reset buffer is 1. Timeout == 1 makes sense only if the reset buffer is 1.
self.timeout_buf = (self.progress_buf >= self.max_episode_length - 1) & (self.reset_buf != 0)
# randomize observations
if self.dr_randomizations.get('observations', None):
self.obs_buf = self.dr_randomizations['observations']['noise_lambda'](self.obs_buf)
self.extras["time_outs"] = self.timeout_buf.to(self.rl_device)
self.obs_dict["obs"] = torch.clamp(self.obs_buf, -self.clip_obs, self.clip_obs).to(self.rl_device)
# asymmetric actor-critic
if self.num_states > 0:
self.obs_dict["states"] = self.get_state()
return self.obs_dict, self.rew_buf.to(self.rl_device), self.reset_buf.to(self.rl_device), self.extras
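- What cartpole.py itself contributes are the two hooks called from step(); a schematic sketch of their shape (simplified from the real Cartpole class; all attributes such as num_dof, max_push_effort, and the buffers belong to Cartpole/VecTask):
import torch
from isaacgym import gymtorch

class CartpoleHooksSketch:  # schematic stand-in for Cartpole(VecTask)
    def pre_physics_step(self, actions):
        # scale the 1-D action to a cart force and push it into the simulator
        actions_tensor = torch.zeros(self.num_envs * self.num_dof, device=self.device, dtype=torch.float)
        actions_tensor[::self.num_dof] = actions.to(self.device).squeeze() * self.max_push_effort
        self.gym.set_dof_actuation_force_tensor(self.sim, gymtorch.unwrap_tensor(actions_tensor))

    def post_physics_step(self):
        # refresh observations, compute rewards, and reset any finished environments
        self.progress_buf += 1
        env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1)
        if len(env_ids) > 0:
            self.reset_idx(env_ids)
        self.compute_observations()
        self.compute_reward()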
七、Overall Call Flow
- This is the call chain from train.py down to the Isaac Gym simulation, invoked from top to bottom:
train.py[main] launched from the command line
train.py[runner.run]
torch_runner.py[self.run_train(args)]
torch_runner.py[agent.train()]
a2c_continuous.py[base class a2c_common.ContinuousA2CBase]
a2c_common.py[train(self)]
a2c_common.py[train_epoch(self)]
a2c_common.py[self.play_steps()]
a2c_common.py[self.env_step()]
cartpole.py[base class VecTask]
vec_task.py[step()]
vec_task.py[self.gym.simulate(self.sim)] calls the Isaac Gym physics step