ptan in practice 6 || Implementing CartPole with ptan

Preparation:

  1. Define the neural network (NN).
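
    A minimal sketch of the kind of network meant here (the same architecture as the Net class in the full code below, which additionally casts inputs to float; obs_size and n_actions come from step 2, HIDDEN_SIZE is a constant):

    net = nn.Sequential(
        nn.Linear(obs_size, HIDDEN_SIZE),
        nn.ReLU(),
        nn.Linear(HIDDEN_SIZE, n_actions)   # one Q-value per action
    )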

  2. Initialize the environment env and determine the sizes of the observation space and action space.
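
    For CartPole this amounts to (as in the full code below; the observation has 4 components and there are 2 discrete actions):

    env = gym.make("CartPole-v0")
    obs_size = env.observation_space.shape[0]   # 4 for CartPole
    n_actions = env.action_space.n              # 2 for CartPole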

  3. Create net = NN; this net is the training network. Then define the target network with ptan.agent.TargetNet(net).
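
    A minimal sketch of how the target network is used (based on the calls in the full code below; states_v stands for any batch of state tensors): tgt_net.target_model is the frozen copy used to compute targets, and tgt_net.sync() copies the training network's weights into it.

    tgt_net = ptan.agent.TargetNet(net)
    q_target = tgt_net.target_model(states_v)   # forward pass through the frozen copy
    tgt_net.sync()                              # copy net's current weights into the target network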

  4. Define the action selector selector: ptan.actions.ArgmaxActionSelector() and ptan.actions.EpsilonGreedyActionSelector(); this takes two lines of code.

    # returns the index of the maximum value along dimension 1
    selector = ptan.actions.ArgmaxActionSelector()

    # with probability epsilon choose a random action, with probability 1 - epsilon take the argmax
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0)

    # probability-distribution action selector; meant to be used with PolicyAgent's softmax output
    selector = ptan.actions.ProbabilityActionSelector()
    print("Actions sampled from three prob distributions:")
    for _ in range(10):
        # three distributions, analogous to the action probabilities output by a policy network;
        # in the first distribution the action at index 1 has probability 80%
        # acts holds the action indices sampled from these distributions, i.e. the returned values are indices
        acts = selector(np.array([
            [0.1, 0.8, 0.1],
            [0.0, 0.0, 1.0],
            [0.5, 0.5, 0.0]
        ]))
        print(acts)
    

    Usage:

    selector = ptan.actions.ArgmaxActionSelector()
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=EPS_DECAY, selector=selector)   # EPS_DECAY is used here as the initial random-action probability; this selector wraps the argmax selector from the line above
    
  5. Define the agent agent: ptan.agent.DQNAgent(), which takes the network net and the action selector selector as arguments.

    ptan.agent.DQNAgent()
    
    # policy-based agent; can be used with the ProbabilityActionSelector() action selector
    ptan.agent.PolicyAgent()
    

    Usage:

    agent = ptan.agent.DQNAgent(net, selector)  # create the agent
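
    A quick sketch of the call convention (assuming ptan's usual agent interface: a batch of observations in, an array of chosen action indices plus a list of internal agent states out; the zero observations are just placeholders):

    actions, agent_states = agent(torch.zeros(2, obs_size))
    print(actions)   # two action indices, one per observation in the batch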
    
  6. Define the experience source ptan.experience.ExperienceSourceFirstLast(), which takes env and agent as arguments. It interacts with the environment automatically, driven by the agent's selector: resetting the environment, choosing actions, executing them, and advancing the state.

    # replay buffer variants (used in step 7): uniform sampling
    ptan.experience.ExperienceReplayBuffer()

    # prioritized sampling, with O(n) sampling complexity
    ptan.experience.PrioReplayBufferNaive()

    # segment-tree based prioritized sampling, O(log n)
    ptan.experience.PrioritizedReplayBuffer()
    

    Usage:

    # steps_count=1 means each experience spans a single step; gamma=1.0 means the one-step reward is not discounted
    exp_source = ptan.experience.ExperienceSourceFirstLast(
            env, agent, gamma=1.0, steps_count=1)   # yields (s, a, r, s_) tuples
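
    Each entry the source yields is a small record; a minimal sketch of inspecting one (the field names match what unpack_batch uses in the full code below; last_state is None when the episode ended on that transition):

    for exp in exp_source:
        print(exp.state, exp.action, exp.reward, exp.last_state)
        break   # just look at the first transition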
    
  7. Define the replay buffer ptan.experience.ExperienceReplayBuffer(), which takes the experience source and the buffer capacity.

    buffer = ptan.experience.ExperienceReplayBuffer(
            exp_source, buffer_size=BUFFER_SIZE)
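
    A minimal sketch of how the buffer is driven (the same calls the training loop below uses): populate(1) pulls one transition from the experience source into the buffer, and sample() draws a uniform random batch once enough data has accumulated.

    buffer.populate(1)                        # take one environment step and store the transition
    if len(buffer) >= BATCH_SIZE:
        batch = buffer.sample(BATCH_SIZE)     # list of BATCH_SIZE experience records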
    

Key function:

exp_source.pop_rewards_steps() returns the (reward, steps) tuples of the episodes completed since the last call, i.e. each episode's total reward and the number of steps taken.
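
Typical usage, matching the training loop below (the returned list is empty while an episode is still in progress):

    for reward, steps in exp_source.pop_rewards_steps():
        print("episode finished: reward=%.1f, steps=%d" % (reward, steps))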

Full code:

import gym
import ptan
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

HIDDEN_SIZE = 128
BATCH_SIZE = 32
TGT_NET_SYNC = 300
GAMMA = 0.9
BUFFER_SIZE = 2000
LR = 0.01
EPS_DECAY = 0.99

class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x.float())

@torch.no_grad()
def unpack_batch(batch, net, gamma):    # returns states, actions, and the TD targets r + gamma * max_a Q(s_, a)
    states = []
    actions = []
    rewards = []
    done_masks = []
    last_states = []

    for exp in batch:
        states.append(exp.state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        # if last_state is None, the episode ended right after this action
        done_masks.append(exp.last_state is None)
        # if s_ is None, substitute the current state s (its Q value is zeroed out below)
        if exp.last_state is None:
            last_states.append(exp.state)
        else:
            last_states.append(exp.last_state)

    states_v = torch.tensor(states)
    actions_v = torch.tensor(actions)
    rewards_v = torch.tensor(rewards)
    last_states_v = torch.tensor(last_states)

    last_state_q_v = net(last_states_v)
    best_last_q_v = torch.max(last_state_q_v, dim=1)[0]
    best_last_q_v[done_masks] = 0.0         # if s_ is a terminal state, Q(s_, a) = 0
    # return states, actions, and the target Q values computed with the target network
    return states_v, actions_v, best_last_q_v * gamma + rewards_v

if __name__ == "__main__":

    env = gym.make("CartPole-v0")
    env = env.unwrapped
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n

    net = Net(obs_size, HIDDEN_SIZE, n_actions)

    tgt_net = ptan.agent.TargetNet(net)         # define the target network

    selector = ptan.actions.ArgmaxActionSelector()
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=EPS_DECAY, selector=selector)   # EPS_DECAY is used here as the initial random-action probability; this selector wraps the argmax selector from the line above

    agent = ptan.agent.DQNAgent(net, selector)  # create the agent

    # steps_count=1 means each experience spans a single step; gamma=1.0 means the one-step reward is not discounted
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=1.0, steps_count=1)   # yields (s, a, r, s_) tuples

    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=BUFFER_SIZE)

    optimizer = optim.Adam(net.parameters(), LR)

    step = 0
    episode = 0
    solved = False
    mean_20reward = []
    best_mean_reward = None
    mean_reward = 0.0

    # training loop
    while True:
        step += 1               
        buffer.populate(1)      # pull one sample from the experience source into the replay buffer, i.e. take one environment step

        # report whenever an episode finishes; if no episode has finished yet, this for loop body does not run
        for reward, steps in exp_source.pop_rewards_steps():
            episode += 1
            print("%d: 第 %d 局游戏结束, reward=%.3f, epsilon=%.2f 共执行 %d 步" % (
                step, episode, reward, selector.epsilon, steps))

            mean_20reward.append(reward)
            mean_reward = np.mean(mean_20reward[-20:])
            if best_mean_reward is None or mean_reward > best_mean_reward:
                if best_mean_reward is not None:
                    print("最近20局平均奖励更新 %.2f => %.2f"%(best_mean_reward,mean_reward))
                best_mean_reward = mean_reward

            solved = mean_reward > 200

        if solved:
            print("Congrats!")
            break

        if len(buffer) < BUFFER_SIZE:
            continue

        # sample a batch of experiences and compute the target Q values Q(s_, a)
        batch = buffer.sample(BATCH_SIZE)
        states_v, actions_v, tgt_q_v = unpack_batch(batch, tgt_net.target_model, GAMMA)

        # update the training network
        optimizer.zero_grad()
        q_v = net(states_v)
        q_v = q_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
        loss_v = F.mse_loss(q_v, tgt_q_v)
        loss_v.backward()
        optimizer.step()

        # the random-action probability decays by a factor of EPS_DECAY every step
        selector.epsilon *= EPS_DECAY

        # sync the training network's parameters to the target network
        if step % TGT_NET_SYNC == 0:
            tgt_net.sync()