ptan in Practice 6 || Implementing CartPole with ptan
Preliminaries:
- Define the neural network (NN).
- Initialize the environment env and determine the sizes of the observation space and the action space.
- Initialize net = NN; this net is the training network. Then define the target network with ptan.agent.TargetNet(net) (a short sketch of its use follows below).
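A minimal sketch of how the target-network wrapper is used (the same calls appear in the complete code below):
tgt_net = ptan.agent.TargetNet(net)   # wraps net and keeps a separate copy of it
frozen = tgt_net.target_model         # the copy used later to compute the Q(s_, a) targets
tgt_net.sync()                        # copy the training network's current weights into that copy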
- Define the action selector selector: ptan.actions.ArgmaxActionSelector() and ptan.actions.EpsilonGreedyActionSelector(), which are combined in two lines of code.
# returns the index of the largest value along dimension 1
selector = ptan.actions.ArgmaxActionSelector()
# with probability epsilon pick a random action, otherwise pick the greedy (argmax) action
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0)
# probability-distribution action selector; meant to be paired with the softmax output of PolicyAgent
selector = ptan.actions.ProbabilityActionSelector()
print("Actions sampled from three prob distributions:")
for _ in range(10):
    # three distributions, like the action probabilities output by a policy network;
    # in the first distribution, index 1 has probability 80%
    # acts holds the action indices sampled from these distributions
    acts = selector(np.array([
        [0.1, 0.8, 0.1],
        [0.0, 0.0, 1.0],
        [0.5, 0.5, 0.0]
    ]))
Usage:
selector = ptan.actions.ArgmaxActionSelector()
selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=EPS_DECAY, selector=selector)  # EPS_DECAY is the probability of a random action; this line wraps the argmax selector created above
- Define the agent agent: ptan.agent.DQNAgent(), which takes the net and the selector as parameters.
ptan.agent.DQNAgent()
# policy-based agent; can be used with the ProbabilityActionSelector() action selector
ptan.agent.PolicyAgent()
Usage:
agent = ptan.agent.DQNAgent(net, selector)  # define the agent
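As a quick illustration, the agent can also be called directly on a batch of observations; it runs the network, applies the selector, and returns the chosen actions together with the agent's internal states (unused for DQN). A small sketch, using a dummy zero observation and the obs_size and agent defined in this walkthrough:
dummy_obs = np.zeros((1, obs_size), dtype=np.float32)  # a batch with one all-zero observation
actions, agent_states = agent(dummy_obs)
print(actions)  # one action index per observation in the batch, e.g. [0] or [1]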
- Define the experience source ptan.experience.ExperienceSourceFirstLast(), which takes env and agent as parameters. It interacts with the environment automatically using the agent's selector: it resets the environment, chooses actions, executes them and advances the state (see the sketch of its output below).
Usage:
# steps_count=1 means each experience covers a single action; gamma=1.0 means the one-step reward is not discounted
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=1.0, steps_count=1)  # yields (s, a, r, s_) tuples
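Each element the source yields is an ExperienceFirstLast record with the fields state, action, reward and last_state (the same fields unpack_batch reads in the complete code). A small sketch that peeks at the first few transitions:
for idx, exp in enumerate(exp_source):
    print(exp.state, exp.action, exp.reward, exp.last_state)  # last_state is None at episode end
    if idx >= 2:
        break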
- Define the experience replay buffer ptan.experience.ExperienceReplayBuffer(), which takes the experience source and the buffer capacity. Several buffer variants exist:
# uniform sampling
ptan.experience.ExperienceReplayBuffer()
# prioritized sampling, sampling complexity O(n)
ptan.experience.PrioReplayBufferNaive()
# segment-tree sampling, O(log(n))
ptan.experience.PrioritizedReplayBuffer()
Usage:
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=BUFFER_SIZE)
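Only two buffer operations are needed in this example, populate() and sample(), both of which appear again in the training loop. A tiny sketch of them in isolation:
buffer.populate(16)       # pulls 16 samples from exp_source, i.e. takes 16 environment steps
batch = buffer.sample(8)  # a list of ExperienceFirstLast records, sampled uniformly
print(len(buffer), len(batch))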
Important function:
exp_source.pop_rewards_steps()
This function returns a (reward, steps) tuple for each episode completed since the last call, i.e. the episode's total reward and the number of steps it took.
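A minimal usage sketch (this is exactly the pattern used in the training loop of the complete code below):
for reward, steps in exp_source.pop_rewards_steps():
    print("episode finished: total reward %.1f over %d steps" % (reward, steps))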
Complete code:
import gym
import ptan
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
HIDDEN_SIZE = 128
BATCH_SIZE = 32
TGT_NET_SYNC = 300
GAMMA = 0.9
BUFFER_SIZE = 2000
LR = 0.01
EPS_DECAY = 0.99  # used both as the initial epsilon and as the per-step decay factor
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x.float())
@torch.no_grad()
def unpack_batch(batch, net, gamma):  # returns the TD targets r + gamma * max_a Q(s_, a)
    states = []
    actions = []
    rewards = []
    done_masks = []
    last_states = []
    for exp in batch:
        states.append(exp.state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        # if s_ is None, the episode ended right after this action
        done_masks.append(exp.last_state is None)
        # if the next state s_ is None, use s_ = s as a placeholder
        if exp.last_state is None:
            last_states.append(exp.state)
        else:
            last_states.append(exp.last_state)
    states_v = torch.tensor(states)
    actions_v = torch.tensor(actions)
    rewards_v = torch.tensor(rewards)
    last_states_v = torch.tensor(last_states)
    last_state_q_v = net(last_states_v)
    best_last_q_v = torch.max(last_state_q_v, dim=1)[0]
    best_last_q_v[done_masks] = 0.0  # if s_ is a terminal state, Q(s_, a) = 0
    # return s, a and the targets computed from the target network
    return states_v, actions_v, best_last_q_v * gamma + rewards_v
if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    env = env.unwrapped
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n
    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    tgt_net = ptan.agent.TargetNet(net)  # define the target network
    selector = ptan.actions.ArgmaxActionSelector()
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=EPS_DECAY, selector=selector)  # EPS_DECAY is the probability of a random action; wraps the argmax selector above
    agent = ptan.agent.DQNAgent(net, selector)  # define the agent
    # steps_count=1 means each experience covers a single action; gamma=1.0 means the one-step reward is not discounted
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=1.0, steps_count=1)  # yields (s, a, r, s_) tuples
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=BUFFER_SIZE)
    optimizer = optim.Adam(net.parameters(), LR)
    step = 0
    episode = 0
    solved = False
    mean_20reward = []
    best_mean_reward = None
    mean_reward = 0.0
    # training loop
    while True:
        step += 1
        buffer.populate(1)  # take one environment step and store the sample in the buffer
        # print some stats every time an episode finishes; if no episode finished, this loop body is skipped
        for reward, steps in exp_source.pop_rewards_steps():
            episode += 1
            print("%d: episode %d finished, reward=%.3f, epsilon=%.2f, %d steps" % (
                step, episode, reward, selector.epsilon, steps))
            mean_20reward.append(reward)
            mean_reward = np.mean(mean_20reward[-20:])
            if best_mean_reward is None or mean_reward > best_mean_reward:
                if best_mean_reward is not None:
                    print("mean reward over the last 20 episodes improved %.2f => %.2f" % (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
            solved = mean_reward > 200
        if solved:
            print("Congrats!")
            break
        if len(buffer) < BUFFER_SIZE:
            continue
        # sample a batch of experiences and compute the targets r + gamma * max_a Q(s_, a)
        batch = buffer.sample(BATCH_SIZE)
        states_v, actions_v, tgt_q_v = unpack_batch(batch, tgt_net.target_model, GAMMA)
        # update the training network
        optimizer.zero_grad()
        q_v = net(states_v)
        q_v = q_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
        loss_v = F.mse_loss(q_v, tgt_q_v)
        loss_v.backward()
        optimizer.step()
        # the probability of a random action decays by a factor of EPS_DECAY every training step
        selector.epsilon *= EPS_DECAY
        # periodically copy the training network's weights to the target network
        if step % TGT_NET_SYNC == 0:
            tgt_net.sync()