The Meaning of episode_reward in the Ray Framework

This post explains the concept of episode_reward in reinforcement learning (RL): it is the cumulative sum of the per-step rewards collected over one agent-environment interaction (a trajectory) from the initial state to the final state, i.e., the return computed with a discount factor of 1.0.


The question is discussed in this Ray forum thread: https://discuss.ray.io/t/meaning-of-episode-reward-mean/3839/5

In short: episode_reward is the return of a single episode with discount value = 1.0, i.e., the sum of the rewards obtained at every step of that episode.

Reference:

In RL, episodes are considered agent-environment interactions from initial to final states; in other words, an episode is one trajectory.
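To make this concrete, here is a minimal sketch (the reward values and the helper function name are made up for illustration) showing that episode_reward is simply the return evaluated with gamma = 1.0:

# Illustrative only: the per-step rewards below are made-up numbers.
rewards = [1.0, 0.5, -0.2, 2.0]  # rewards collected at each step of one episode

def episode_return(rewards, gamma=1.0):
    # Discounted return of a trajectory; gamma = 1.0 reduces to a plain sum.
    ret = 0.0
    for t, r in enumerate(rewards):
        ret += (gamma ** t) * r
    return ret

print(episode_return(rewards))             # 3.3 -> this sum is what is reported as episode_reward
print(episode_return(rewards, gamma=0.9))  # a discounted return, which episode_reward does not use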

For reference, the evaluation script below restores a trained EGPO agent from a checkpoint, rolls it out in the environment, and records the reward and cost of every episode:

"""
Evaluate the trained EGPO agent without the expert.
Please change the `CKPT_PATH` to the path of the checkpoint you want to evaluate.
"""
import pathlib

import tqdm

from egpo_utils.egpo.egpo import EGPOTrainer
from egpo_utils.human_in_the_loop_env import HumanInTheLoopEnv
from egpo_utils.train.utils import initialize_ray

TRAINING_SCRIPT_FOLDER = pathlib.Path(__file__).parent

initialize_ray(test_mode=False)


def get_function(ckpt):
    trainer = EGPOTrainer(dict(
        env=HumanInTheLoopEnv,

        # ===== Training =====
        takeover_data_discard=False,
        alpha=10.0,
        recent_episode_num=5,
        normalize=True,
        twin_cost_q=True,
        k_i=0.01,
        k_p=5,
        # search > 0
        k_d=0.1,
        # expected max takeover num
        cost_limit=300,
        optimization=dict(actor_learning_rate=1e-4, critic_learning_rate=1e-4, entropy_learning_rate=1e-4),
        prioritized_replay=False,
        horizon=400,
        target_network_update_freq=1,
        timesteps_per_iteration=100,
        metrics_smoothing_episodes=10,
        learning_starts=100,
        clip_actions=False,
        normalize_actions=True,
    ))
    print("Restoring from checkpoint: ", ckpt)
    trainer.restore(str(ckpt))

    def _f(obs):
        ret = trainer.compute_actions({"default_policy": obs})
        return ret

    return _f


if __name__ == '__main__':
    EPISODE_NUM = 50
    CKPT_PATH = TRAINING_SCRIPT_FOLDER / "EGPO/EGPOTrainer_ExpertGuidedEnv_23216_00000_0_seed=0_2024-04-08_11-59-33/checkpoint_1/checkpoint-1"

    def make_env(env_id=None):
        return HumanInTheLoopEnv(dict(manual_control=False, use_render=False))

    data = []
    env = make_env()
    compute_actions = get_function(CKPT_PATH)

    o = env.reset()

    epi_num = 0
    total_cost = 0
    total_reward = 0
    success_rate = 0
    ep_cost = 0
    ep_reward = 0
    success_flag = False
    horizon = 2000
    step = 0

    with tqdm.tqdm(total=EPISODE_NUM, desc="Episode") as pbar:
        while True:
            # action_to_send = compute_actions(w, [o], deterministic=False)[0]
            step += 1
            action_to_send = compute_actions(o)["default_policy"]
            o, r, d, info = env.step(action_to_send)
            total_reward += r
            ep_reward += r
            total_cost += info["cost"]
            ep_cost += info["cost"]
            if d or step > horizon:
                if info["arrive_dest"]:
                    success_rate += 1
                    success_flag = True
                epi_num += 1
                pbar.update(1)
                if epi_num > EPISODE_NUM:
                    break
                else:
                    o = env.reset()

                data.append({"reward": ep_reward, "success": success_flag, "cost": ep_cost})

                ep_cost = 0.0
                ep_reward = 0.0
                success_flag = False
                step = 0

    print(
        "success_rate:{}, mean_episode_reward:{}, mean_episode_cost:{}".format(
            success_rate / EPISODE_NUM, total_reward / EPISODE_NUM, total_cost / EPISODE_NUM))

    del compute_actions
    env.close()

    import pandas as pd
    pd.DataFrame(data).to_csv("egpo_eval.csv")
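Note that the ep_reward variable in the rollout loop accumulates the per-step reward r and is reset when the episode ends, so each row written to egpo_eval.csv contains exactly the undiscounted episode_reward described above. A quick way to inspect those records is sketched below; it only assumes the egpo_eval.csv file produced by the script above, plus pandas and matplotlib being installed:

import pandas as pd
import matplotlib.pyplot as plt

# Load the per-episode records written by the evaluation script.
df = pd.read_csv("egpo_eval.csv")

print("mean episode reward:", df["reward"].mean())
print("mean episode cost:  ", df["cost"].mean())
print("success rate:       ", df["success"].mean())

# Plot the reward of each evaluated episode.
df["reward"].plot(kind="bar", title="Episode reward per evaluation episode")
plt.xlabel("episode index")
plt.ylabel("episode_reward")
plt.tight_layout()
plt.show()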
A follow-up question was posted together with the offline CQL training script below:

import copy
import ray
from ray import tune
from egpo_utils.cql.cql import CQLTrainer
from egpo_utils.common import evaluation_config, ILCallBack, CQLInputReader
from egpo_utils.expert_guided_env import ExpertGuidedEnv
from egpo_utils.train import get_train_parser
from egpo_utils.train.train import train
import os

data_set_file_path = os.path.join(os.path.dirname(__file__), 'expert_traj_500.json')


def get_data_sampler_func(ioctx):
    return CQLInputReader(data_set_file_path)


eval_config = copy.deepcopy(evaluation_config)
eval_config["input"] = "sampler"  # important to use pgdrive online evaluation
eval_config["env_config"]["random_spawn"] = True

if __name__ == '__main__':
    print(data_set_file_path)
    try:
        file = open(data_set_file_path)
    except FileNotFoundError:
        raise FileExistsError("Please collect dataset by using collect_dataset.py at first")

    assert ray.__version__ == "1.3.0" or ray.__version__ == "1.2.0", "ray 1.3.0 is required"
    args = get_train_parser().parse_args()

    exp_name = "CQL" or args.exp_name
    stop = {"timesteps_total": 100_0000_00000}

    config = dict(
        # ===== Evaluation =====
        env=ExpertGuidedEnv,
        env_config=evaluation_config["env_config"],
        input_evaluation=["simulation"],
        evaluation_interval=1,
        evaluation_num_episodes=30,
        evaluation_config=eval_config,
        evaluation_num_workers=2,
        metrics_smoothing_episodes=20,

        # ===== Training =====
        # cql para
        lagrangian=False,
        # Automatic temperature (alpha prime) control
        temperature=5,  # alpha prime in paper, 5 is best in pgdrive
        min_q_weight=0.2,  # best
        bc_iters=20_0000,  # bc_iters > 20_0000 has no obvious improvement

        # offline setting
        no_done_at_end=True,
        input=get_data_sampler_func,
        optimization=dict(actor_learning_rate=1e-4, critic_learning_rate=1e-4, entropy_learning_rate=1e-4),
        rollout_fragment_length=200,
        prioritized_replay=False,
        horizon=2000,
        target_network_update_freq=1,
        timesteps_per_iteration=1000,
        learning_starts=10000,
        clip_actions=False,
        normalize_actions=True,
        num_cpus_for_driver=0.5,
        # No extra worker used for learning. But this config impact the evaluation workers.
        num_cpus_per_worker=0.1,
        # num_gpus_per_worker=0.1 if args.num_gpus != 0 else 0,
        num_gpus=0.2 if args.num_gpus != 0 else 0,
        framework="torch"
    )

    train(
        CQLTrainer,
        exp_name=exp_name,
        keep_checkpoints_num=5,
        stop=stop,
        config=config,
        num_gpus=args.num_gpus,
        # num_seeds=2,
        num_seeds=5,
        custom_callback=ILCallBack,
        # test_mode=True,
        # local_mode=True
    )

How can the results of this training run be visualized?
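Regarding the visualization question: assuming the train() helper is a wrapper around Ray Tune that writes to the default output directory (~/ray_results/<exp_name>/), every trial folder contains a progress.csv whose columns include timesteps_total and episode_reward_mean, so the learning curves can be plotted with pandas and matplotlib as sketched below (the result path is an assumption; adjust it to wherever your trials were written). Alternatively, point TensorBoard at the same directory: tensorboard --logdir ~/ray_results/CQL

import glob
import os

import pandas as pd
import matplotlib.pyplot as plt

# Assumption: trials live in the default Ray Tune directory ~/ray_results/CQL/.
result_dir = os.path.expanduser("~/ray_results/CQL")

for progress_file in glob.glob(os.path.join(result_dir, "*", "progress.csv")):
    df = pd.read_csv(progress_file)
    # timesteps_total and episode_reward_mean are standard RLlib progress columns.
    plt.plot(df["timesteps_total"], df["episode_reward_mean"],
             label=os.path.basename(os.path.dirname(progress_file)))

plt.xlabel("timesteps_total")
plt.ylabel("episode_reward_mean")
plt.title("CQL training curves")
plt.legend(fontsize=6)
plt.show()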