- Core idea: throw away the bad episodes.
- Steps:
  - Play N episodes in the environment using the current model.
  - Calculate the total reward of every episode and determine a reward boundary; usually some percentile of all the rewards is used, e.g. the 50th or 70th percentile.
  - Throw away all episodes whose total reward is below the boundary (a small numeric sketch of this filtering step follows the list).
  - Train on the remaining "elite" episodes, using the observations as input and the issued actions as the desired output.
  - Keep repeating the above steps.
- Limitations:
  - For training, the episodes have to be finite and, preferably, short.
  - The total reward of the episodes should have enough variability to separate good episodes from bad ones.
  - There is no intermediate indication of whether the agent is succeeding or failing.
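A minimal numeric sketch of the filtering step described above (percentile boundary plus discarding weak episodes). The rewards here are made up purely for illustration and are not taken from the CartPole run shown later:

# Toy illustration of the episode-filtering step (illustrative numbers only).
import numpy as np

episode_rewards = [10.0, 35.0, 22.0, 48.0, 15.0, 60.0]    # total reward of 6 hypothetical episodes
reward_bound = np.percentile(episode_rewards, 70)          # 70th percentile -> 41.5 for these numbers
elite = [r for r in episode_rewards if r >= reward_bound]  # episodes kept for training
print(reward_bound, elite)                                 # 41.5 [48.0, 60.0]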
Code example:
import gym
import torch
import numpy as np
from torch import nn
from collections import namedtuple
from tensorboardX import SummaryWriter
import torch.optim as optim
HIDDEN_SIZE = 128  # hidden layer size of the policy network
BATCH_SIZE = 20    # number of episodes played per training iteration
PERCENTILE = 70    # reward percentile used as the elite-episode boundary
# Policy network: maps an observation to raw action scores (logits).
# Softmax is applied separately when sampling actions, and CrossEntropyLoss
# applies it internally during training.
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

# One finished episode: its undiscounted total reward and the steps it contains.
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# A single step: the observation seen and the action taken from it.
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])
def iterate_batches(env, net, batch_size):
    # Play episodes with the current policy and yield them in batches of batch_size.
    # Note: this uses the classic Gym API (env.reset() returns an observation,
    # env.step() returns a 4-tuple).
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)  # turns the network's logits into action probabilities
    while True:
        obs_v = torch.FloatTensor([obs])
        act_probs_v = sm(net(obs_v))
        act_probs = act_probs_v.data.numpy()[0]
        # Sample an action from the policy's probability distribution.
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs
def filter_batch(batch, percentile):
    # Keep only the "elite" episodes whose total reward is at or above the given
    # percentile of the batch, and flatten their steps into training data.
    rewards = list(map(lambda s: s.reward, batch))
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    train_obs = []
    train_act = []
    for example in batch:
        if example.reward < reward_bound:
            continue
        train_obs.extend(map(lambda step: step.observation, example.steps))
        train_act.extend(map(lambda step: step.action, example.steps))

    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_bound, reward_mean
if __name__ == '__main__':
    env = gym.make("CartPole-v0")
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # Training is plain supervised classification: predict the elite actions
    # from the observations that led to them.
    objective = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=net.parameters(), lr=0.01)
    writer = SummaryWriter(comment="-cartpole")

    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        obs_v, act_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, act_v)
        loss_v.backward()
        optimizer.step()
        print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
            iter_no, loss_v.item(), reward_m, reward_b))
        writer.add_scalar("loss", loss_v.item(), iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)
        if reward_m > 199:  # CartPole-v0 is considered solved near its 200-step cap
            print("Solved!")
            break
    writer.close()
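The SummaryWriter writes its event files to a runs/ directory by default, so the logged loss and reward curves can typically be viewed by running tensorboard --logdir runs. Below is the console output from one training run; the exact numbers vary between runs, but reward_mean should keep climbing until it passes the solving threshold of 199: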
0: loss=0.691, reward_mean=22.8, reward_bound=26.3
1: loss=0.685, reward_mean=25.1, reward_bound=28.3
2: loss=0.671, reward_mean=26.8, reward_bound=27.5
3: loss=0.665, reward_mean=35.0, reward_bound=30.3
4: loss=0.646, reward_mean=34.9, reward_bound=34.6
5: loss=0.643, reward_mean=46.4, reward_bound=54.3
6: loss=0.630, reward_mean=42.5, reward_bound=47.3
7: loss=0.625, reward_mean=63.3, reward_bound=81.2
8: loss=0.614, reward_mean=46.5, reward_bound=45.6
9: loss=0.602, reward_mean=47.5, reward_bound=54.9
10: loss=0.617, reward_mean=60.9, reward_bound=66.2
11: loss=0.622, reward_mean=56.8, reward_bound=74.4
12: loss=0.584, reward_mean=62.5, reward_bound=82.8
13: loss=0.586, reward_mean=65.5, reward_bound=76.4
14: loss=0.584, reward_mean=73.8, reward_bound=88.2
15: loss=0.576, reward_mean=69.0, reward_bound=87.9
16: loss=0.576, reward_mean=83.0, reward_bound=90.5
17: loss=0.573, reward_mean=79.7, reward_bound=82.9
18: loss=0.570, reward_mean=73.7, reward_bound=85.9
19: loss=0.577, reward_mean=73.0, reward_bound=80.4
20: loss=0.570, reward_mean=88.9, reward_bound=90.2
21: loss=0.555, reward_mean=85.8, reward_bound=94.5
22: loss=0.567, reward_mean=114.1, reward_bound=126.3
23: loss=0.556, reward_mean=120.5, reward_bound=137.4
24: loss=0.558, reward_mean=138.1, reward_bound=161.3
25: loss=0.560, reward_mean=156.2, reward_bound=200.0
26: loss=0.566, reward_mean=164.2, reward_bound=200.0
27: loss=0.551, reward_mean=160.5, reward_bound=200.0
28: loss=0.552, reward_mean=184.5, reward_bound=200.0
29: loss=0.556, reward_mean=177.6, reward_bound=200.0
30: loss=0.546, reward_mean=186.8, reward_bound=200.0
31: loss=0.547, reward_mean=184.9, reward_bound=200.0
32: loss=0.542, reward_mean=186.2, reward_bound=200.0
33: loss=0.546, reward_mean=194.2, reward_bound=200.0
34: loss=0.547, reward_mean=196.9, reward_bound=200.0
35: loss=0.547, reward_mean=197.5, reward_bound=200.0
36: loss=0.549, reward_mean=194.4, reward_bound=200.0
37: loss=0.537, reward_mean=194.1, reward_bound=200.0
38: loss=0.536, reward_mean=200.0, reward_bound=200.0
Solved!