Principle

Changes required in the code:
params = SimpleNamespace(**{
    'env_name': "CartPole-v0",
    'stop_reward': 250.0,
    'replay_size': 2000,        # replay buffer capacity
    'replay_initial': 1000,     # start training once the buffer holds 1000 samples
    'target_net_sync': 300,     # how often to sync the target network
    'epsilon_frames': 600,      # number of steps over which epsilon is linearly decayed
    'epsilon_start': 1.0,       # initial epsilon
    'epsilon_final': 0.1,       # final epsilon after decay
    'learning_rate': 0.001,
    'gamma': 0.90,              # discount factor
    'batch_size': 32,
    'steps_count': 1            # step count n for n-step DQN; 1 gives vanilla DQN
})
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params.gamma, steps_count=params.steps_count)
loss_v = common.calc_loss_dqn(buffer.sample(params.batch_size),
                              net, tgt_net.target_model.to(device),
                              params.gamma ** params.steps_count,  # n-step DQN needs the discount raised to the n-th power
                              device)
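Why the discount passed to the loss is params.gamma ** params.steps_count: ExperienceSourceFirstLast already folds the n intermediate rewards into a single experience.reward (r_1 + gamma*r_2 + ... + gamma^(n-1)*r_n), so only the bootstrap value of the last state still needs discounting, by gamma^n. A minimal sketch of that arithmetic (n_step_target is a hypothetical helper for illustration, not part of ptan or the script):

# Hypothetical helper, for illustration only.
def n_step_target(rewards, gamma, q_last_max):
    # rewards: the n immediate rewards r_1..r_n collected along the rollout
    folded = sum(gamma ** i * r for i, r in enumerate(rewards))  # what ExperienceSourceFirstLast stores as reward
    return folded + gamma ** len(rewards) * q_last_max           # bootstrap term discounted by gamma^n

# Example with n=2, gamma=0.9, both rewards 1.0 and a bootstrap value of 5.0:
# 1.0 + 0.9*1.0 + 0.9**2 * 5.0 = 5.95
print(n_step_target([1.0, 1.0], 0.9, 5.0))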
Full code:
import gym
import ptan
import torch
import torch.optim as optim
import numpy as np
from lib import dqn_model, common
from tensorboardX import SummaryWriter
from types import SimpleNamespace
params = SimpleNamespace(**{
    'env_name': "CartPole-v0",
    'stop_reward': 250.0,
    'replay_size': 2000,        # replay buffer capacity
    'replay_initial': 1000,     # start training once the buffer holds 1000 samples
    'target_net_sync': 300,     # how often to sync the target network
    'epsilon_frames': 600,      # number of steps over which epsilon is linearly decayed
    'epsilon_start': 1.0,       # initial epsilon
    'epsilon_final': 0.1,       # final epsilon after decay
    'learning_rate': 0.001,
    'gamma': 0.90,              # discount factor
    'batch_size': 32,
    'steps_count': 1            # step count n for n-step DQN; 1 gives vanilla DQN
})
writer = SummaryWriter(comment="_"+str(params.steps_count)+"_step_DQN")
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
env = gym.make(params.env_name)
env = env.unwrapped
STATE_SPACE = env.observation_space.shape[0]
ACTION_SPACE = env.action_space.n
net = dqn_model.DQN(STATE_SPACE, ACTION_SPACE).to(device)
tgt_net = ptan.agent.TargetNet(net)
selector = ptan.actions.ArgmaxActionSelector()
# epsilon_start is the initial probability of taking a random action;
# the epsilon-greedy selector wraps the argmax selector defined above
selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=params.epsilon_start, selector=selector)
epsilon_tracker = common.EpsilonTracker(selector, params)  # epsilon decay schedule
agent = ptan.agent.DQNAgent(net, selector, device=device)  # define the agent
# experience source and replay buffer
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=params.gamma, steps_count=params.steps_count)
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, buffer_size=params.replay_size)
# optimizer
optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)
epsilon_step = 0
total_step = 0
step = 0
solved = False
mean_20reward = []
best_mean_reward = None
mean_reward = 0.0
while True:
    total_step += 1
    buffer.populate(1)  # take one env step: pull one sample from the experience source into the buffer
    # report once an episode finishes; if no episode ended on this step, the for loop body is skipped
    for reward, game_steps in exp_source.pop_rewards_steps():
        step += 1
        print("Episode %d finished, reward=%.3f, epsilon=%.2f, %d steps this episode, %d steps total" % (
            step, reward, epsilon_tracker.selector.epsilon, game_steps, total_step))
        mean_20reward.append(reward)
        mean_reward = np.mean(mean_20reward[-20:])
        writer.add_scalar("mean_20reward", mean_reward, step)
        if best_mean_reward is None or mean_reward > best_mean_reward:
            if best_mean_reward is not None:
                print("Mean reward over the last 20 episodes improved: %.2f => %.2f" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
        solved = mean_reward > params.stop_reward
    if solved:
        print("Congrats!")
        break
    if len(buffer) < params.replay_initial:
        continue
    # training step
    optimizer.zero_grad()
    loss_v = common.calc_loss_dqn(buffer.sample(params.batch_size),
                                  net, tgt_net.target_model.to(device),
                                  params.gamma ** params.steps_count,
                                  device)
    loss_v.backward()
    optimizer.step()
    writer.add_scalar("loss", loss_v.item(), epsilon_step)
    epsilon_step += 1
    # linearly decay epsilon (the probability of a random action) with the number of training steps
    epsilon_tracker.frame(epsilon_step)
    writer.add_scalar("epsilon", epsilon_tracker.selector.epsilon, epsilon_step)
    # periodically copy the training network's weights into the target network
    if epsilon_step % params.target_net_sync == 0:
        tgt_net.sync()
writer.close()
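The helpers common.EpsilonTracker and common.calc_loss_dqn come from the lib package and are not listed here. Below is a minimal sketch of what they are assumed to do (linear epsilon decay and the standard DQN MSE loss, with gamma**n passed in for the n-step variant); the actual lib/common.py may differ in details:

import numpy as np
import torch
import torch.nn as nn

class EpsilonTracker:
    """Linearly decay selector.epsilon from epsilon_start to epsilon_final
    over epsilon_frames steps (hence the 'denominator' role of epsilon_frames)."""
    def __init__(self, selector, params):
        self.selector = selector
        self.params = params
        self.frame(0)

    def frame(self, frame_idx):
        eps = self.params.epsilon_start - frame_idx / self.params.epsilon_frames
        self.selector.epsilon = max(self.params.epsilon_final, eps)

def calc_loss_dqn(batch, net, tgt_net, gamma, device="cpu"):
    """Standard DQN MSE loss; for n-step transitions, gamma should already be gamma**n."""
    states, actions, rewards, dones, last_states = [], [], [], [], []
    for exp in batch:  # batch of ptan ExperienceFirstLast tuples
        states.append(exp.state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        dones.append(exp.last_state is None)
        last_states.append(exp.state if exp.last_state is None else exp.last_state)
    states_v = torch.tensor(np.array(states), dtype=torch.float32).to(device)
    next_states_v = torch.tensor(np.array(last_states), dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    q_sa = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_q = tgt_net(next_states_v).max(1)[0]
        next_q[done_mask] = 0.0  # no bootstrap value for terminal transitions
    target = rewards_v + gamma * next_q
    return nn.MSELoss()(q_sa, target)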
Comparison:
Overall, 2-step DQN performs better than both vanilla DQN and 3-step DQN.
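To reproduce the comparison, run the script once per value of steps_count (1, 2, and 3) and compare the mean_20reward curves in TensorBoard; the SummaryWriter comment already puts the step count into the run name, so the three runs show up side by side under the default runs directory. A hypothetical way to switch the value from the command line instead of editing the dict (the --steps flag is an assumption, not part of the original script):

import argparse

# Hypothetical command-line override, for illustration only;
# place it right after the params definition in the script above.
parser = argparse.ArgumentParser()
parser.add_argument("--steps", type=int, default=1,
                    help="n for n-step DQN (1 = vanilla DQN)")
args = parser.parse_args()
params.steps_count = args.steps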