All of the code below may produce different output, or even stop working, as the gym, PyTorch, or even Python version changes. Don't pin yourself to specific library versions just to keep the code running; what matters most is understanding the essence of the algorithm and the basic features of the programming language.
CartPole-v0 is essentially gym's "Hello, world" environment.
Create the environment in Python:
import gym
env = gym.make('CartPole-v0')
Check the actions available to the agent:
env.action_space
---
The output is:
Discrete(2)
That is, the action is a discrete value and can only be chosen from range(2), i.e. 0 or 1.
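As a quick sanity check (a minimal sketch; the sampled value is random), you can draw actions directly from the action space:
env.action_space.sample() # returns 0 or 1 uniformly at random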
Check the environment's observation space:
env.observation_space
---
The output is (the exact values vary by version):
Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
As you can see, the observation consists of 4 variables (cart position, cart velocity, pole angle, pole angular velocity), and the output also gives each variable's lower and upper bounds.
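The bounds can also be read programmatically (a minimal sketch; the ±3.4e38 entries are float32's largest value, i.e. effectively unbounded):
env.observation_space.low  # per-variable lower bounds
env.observation_space.high # per-variable upper bounds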
CartPole-v0 is a first-order inverted-pendulum control problem: a pole is hinged on top of a cart, and the agent must push the cart left or right to keep the pole from falling over.
Our approach is the same as before: first sample a batch of episodes, then use reward_bound to filter out the episodes with low reward; the remaining, relatively "elite" episodes are used to train the neural network. Repeating this cycle of selection plus network training keeps pushing the reward up. A minimal numeric sketch of the filtering step follows.
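Before the full program, here is that filtering step in isolation (the reward values are made up for illustration):
import numpy as np
rewards = [10., 30., 20., 50.]              # total reward of each episode in a batch
bound = np.percentile(rewards, 70)          # 70th-percentile reward: 32.0 here
elite = [r for r in rewards if r >= bound]  # keeps only [50.0] for training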
from collections import namedtuple
import torch
from torch import nn
import gym
import numpy as np
from tensorboardX import SummaryWriter
## hyperparameter definitions
HIDDEN_SIZE = 128 # number of neurons in the hidden layer
BATCH_SIZE = 16 # number of episodes sampled per batch
PERCENTILE = 70 # percentile that defines the reward bound
class Net(nn.Module):
    '''
    A simple MLP (multi-layer perceptron) policy network.
    '''
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.net = nn.Sequential(nn.Linear(input_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, output_size))

    def forward(self, x):
        return self.net(x)
# a namedtuple works like a simple struct
Episode = namedtuple('Episode', field_names=['rewards', 'steps'])  # an Episode stores the total reward of one history plus every step's record, each an EpisodeStep
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])  # an EpisodeStep records one step: the observation at that moment and the action taken
def batch_iterator(env: gym.Env, net, batch_size):
    '''
    Sample batch_size episodes at a time and yield each batch.
    '''
    softmax = nn.Softmax(0)
    observation = env.reset()  # older gym API: reset() returns only the observation
    batch = []
    steps = []
    rewards = 0
    while True:
        # the network outputs logits for each action; softmax normalizes them into probabilities
        action_prob = softmax(net(torch.tensor(observation))).detach().numpy()
        # sample an action according to the action probability distribution
        action = np.random.choice(len(action_prob), p=action_prob)
        # advance the environment one step (older gym API: step() returns a 4-tuple)
        next_observation, reward, done, _ = env.step(action)
        rewards += reward  # accumulate the reward, assuming gamma = 1
        step = EpisodeStep(observation=observation, action=action)  # record this step's observation and action
        steps.append(step)  # append the step record to the steps list
        if done:  # the episode is over, record it
            episode = Episode(rewards=rewards, steps=steps)
            batch.append(episode)  # append the finished episode to the batch
            rewards = 0
            steps = []
            next_observation = env.reset()
            if len(batch) == batch_size:  # once a full batch has been sampled, yield it
                yield batch
                batch = []
        observation = next_observation
def batch_filter(batch, percentile):
    '''
    Drop the episodes in the batch whose reward falls below the bound, i.e. the "selection" step.
    '''
    reward_list = list(map(lambda episode: episode.rewards, batch))
    # sort the episode rewards and take the given percentile as this batch's reward bound
    reward_bound = np.percentile(reward_list, percentile)
    reward_mean = np.mean(reward_list)
    train_observation = []
    train_action = []
    for rewards, steps in batch:
        # keep only the well-performing episodes after selection for training
        if rewards < reward_bound:
            continue
        train_observation.extend(map(lambda step: step.observation, steps))
        train_action.extend(map(lambda step: step.action, steps))
    # converting to a single np.array first avoids PyTorch's slow list-of-arrays path
    return torch.tensor(np.array(train_observation)), torch.tensor(train_action), reward_bound, reward_mean
def test_model(env: gym.Env, net):
    '''
    Test how the trained network behaves in the environment.
    '''
    observation = env.reset()
    softmax = nn.Softmax(0)
    while True:  # run forever, rendering the trained policy
        action_prob = softmax(net(torch.tensor(observation))).detach().numpy()
        action = np.random.choice(len(action_prob), p=action_prob)
        observation, reward, done, _ = env.step(action)
        env.render()
        if done:
            observation = env.reset()  # assign the reset result, otherwise the stale observation would be reused
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    input_size = env.observation_space.shape[0]  # 4 observation variables
    output_size = env.action_space.n             # 2 actions
    net = Net(input_size, HIDDEN_SIZE, output_size)
    observation = env.reset()
    optimizer = torch.optim.Adam(params=net.parameters(), lr=0.01)
    cross_entropy = nn.CrossEntropyLoss()
    writer = SummaryWriter(comment='-cartpole')  # a SummaryWriter object records the training metrics
    for iter_no, batch in enumerate(batch_iterator(env, net, BATCH_SIZE)):
        observation, action, reward_bound, reward_mean = batch_filter(batch, PERCENTILE)
        action_hat = net(observation)
        optimizer.zero_grad()
        loss = cross_entropy(action_hat, action)  # train the network to imitate the elite episodes' actions
        loss.backward()
        optimizer.step()
        writer.add_scalar('loss', loss.item(), iter_no)
        writer.add_scalar('reward_bound', reward_bound, iter_no)
        writer.add_scalar('reward_mean', reward_mean, iter_no)
        if reward_mean > 199:  # stop once the mean reward exceeds 199 (CartPole-v0 episodes are capped at 200 steps)
            break
    writer.close()
    test_model(env, net)
TensorBoard records how the three metrics, loss, reward_bound, and reward_mean, evolve over the course of training, as shown below.
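To view the curves yourself (assuming tensorboardX's default log directory runs/), start TensorBoard from the shell and open the printed URL in a browser:
tensorboard --logdir runs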