The previous post covered how to load and retrain a behavior-cloning model; this post puts it into practice with one of gym's environments.
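A note on dependencies (these are my assumptions; the original post does not pin versions): the code below uses the classic gym API, where env.step returns a 4-tuple and env.reset returns only the observation, so it needs a pre-Gymnasium gym alongside stable-baselines3 and imitation:

# Versions are assumptions, not pinned by the original post:
# pip install "gym<0.26" stable-baselines3 imitation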
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv

env_name = "LunarLander-v2"
env = gym.make(env_name)
env = DummyVecEnv([lambda: env])  # PPO expects a vectorized environment

model = PPO(
    "MlpPolicy",
    env=env,
    batch_size=64,
    gae_lambda=0.98,
    gamma=0.999,
    n_epochs=4,
    ent_coef=0.01,
    verbose=1,
    tensorboard_log="./tensorboard/LunarLander-v2/",
)
model.learn(total_timesteps=int(1e6))  # total_timesteps should be an int
model.save("./model/LunarLander_PPO.pkl")

# Reload the trained model and watch one episode
env = gym.make(env_name)
model = PPO.load("./model/LunarLander_PPO.pkl")
state = env.reset()
done = False
score = 0
while not done:
    action, _ = model.predict(observation=state)
    state, reward, done, info = env.step(action)
    score += reward
    env.render()
env.close()
print("score:", score)
The expert's training code is pasted above ↑
It trains a lunar-lander capsule that touches down on the moon, adjusting its attitude by firing its engines.
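As a quick orientation, it may help to inspect the environment's interfaces: LunarLander-v2 uses an 8-dimensional observation and 4 discrete engine actions (a minimal check, not part of the original post):

import gym

env = gym.make("LunarLander-v2")
# Box(8,): x/y position, x/y velocity, angle, angular velocity, two leg-contact flags
print(env.observation_space)
# Discrete(4): do nothing, fire left engine, fire main engine, fire right engine
print(env.action_space)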
import numpy as np
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from imitation.algorithms import bc
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper

env_name = "LunarLander-v2"
rng = np.random.default_rng(0)

# Load the environment and the expert model, then score the expert
env = gym.make(env_name)
expert = PPO.load("./model/LunarLander_PPO.pkl")
reward, _ = evaluate_policy(expert, env, 10)  # mean reward over 10 episodes
print("Reward:", reward)

# Generate expert trajectories
rollouts = rollout.rollout(
    expert,
    DummyVecEnv([lambda: RolloutInfoWrapper(env)]),
    rollout.make_sample_until(min_timesteps=None, min_episodes=100),
    rng=rng,
)
transitions = rollout.flatten_trajectories(rollouts)

# Build a behavior-cloning model and train it
bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    demonstrations=transitions,
    rng=rng,
)
bc_trainer.train(n_epochs=10)

# Score the behavior-cloning model and save it
reward, _ = evaluate_policy(bc_trainer.policy, env, 10)
print("Reward:", reward)
bc_trainer.save_policy("launch_bc")

# Run the cloned policy for one episode
state = env.reset()
done = False
score = 0
while not done:
    action, _ = bc_trainer.policy.predict(observation=state)
    state, reward, done, info = env.step(action)
    score += reward
    env.render()
env.close()
print("score:", score)
This code uses behavior cloning to produce a model from the expert trajectories and checks how well it performs.
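To see what bc.BC is actually trained on, you can inspect the Transitions object that flatten_trajectories returns; it is essentially flat arrays of state-action pairs (a small inspection sketch, run right after the code above):

print(type(transitions))        # imitation.data.types.Transitions
print(len(transitions))         # total number of (state, action) pairs from the 100 episodes
print(transitions.obs.shape)    # (N, 8): one observation per step
print(transitions.acts.shape)   # (N,): one discrete action per step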
import numpy as np
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from imitation.algorithms import bc

rng = np.random.default_rng(0)
env = gym.make("LunarLander-v2")

# Load the saved BC policy and run it for one episode
newbc = bc.reconstruct_policy("launch_bc")
state = env.reset()
done = False
score = 0
while not done:
    action, _ = newbc.predict(observation=state)
    state, reward, done, info = env.step(action)
    score += reward
    env.render()
env.close()

reward, _ = evaluate_policy(newbc, env, 10)
print("newbcReward:", reward)

# Create a fresh PPO model
expert = PPO(
    "MlpPolicy",
    env=env,
    batch_size=64,
    gae_lambda=0.98,
    gamma=0.999,
    n_epochs=4,
    ent_coef=0.01,
    verbose=1,
    tensorboard_log="./tensorboard/LunarLander-v2/",
)

# Swap PPO's network for the BC policy (this works because the BC policy
# is itself an ActorCriticPolicy, which is what PPO expects)
expert.policy = newbc
reward, _ = evaluate_policy(expert, env, 10)
print("exReward:", reward)

# Fine-tune with PPO
expert.learn(10000)
reward, _ = evaluate_policy(expert, env, 10)
print("ex2Reward:", reward)
The code above loads the BC model and retrains it with PPO.
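Two follow-ups worth noting. First, the fine-tuned model can be persisted the same way as the original expert. Second, assigning expert.policy = newbc replaces the whole policy object; an alternative warm start is to copy only the weights into PPO's own policy, which requires the two networks to share an architecture (bc.BC's default policy uses two 32-unit hidden layers, while PPO's "MlpPolicy" defaults to 64-unit layers). A hedged sketch; the save path is my own choice:

# Persist the fine-tuned model (path is illustrative)
expert.save("./model/LunarLander_BC_PPO.pkl")

# Alternative warm start: keep PPO's own policy object and copy weights only.
# This requires identical architectures on both sides, e.g. building PPO with
# policy_kwargs=dict(net_arch=[32, 32]) to match bc.BC's default network.
# expert.policy.load_state_dict(newbc.state_dict())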