Classic Reinforcement Learning Algorithm Notes (18): REINFORCE for Discrete Action Spaces


The earlier post Classic Reinforcement Learning Algorithm Notes (7): Policy Gradient covered the Policy Gradient algorithm for continuous action spaces. I recently needed a discrete-action version, so I wrote a PG implementation for CartPole-v1.

Compared with the original PG code, the changes are mainly in the select_action and update_parameters functions. In the continuous version the action is a two-dimensional continuous vector, so the action's log-probability and entropy are two-dimensional vectors as well; for the CartPole environment the action is a single discrete choice, so these pieces have to be adapted accordingly, as sketched below.
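To make the difference concrete, here is a minimal sketch contrasting the two action-selection routines. The continuous branch (a Normal distribution parameterised by mu and sigma) follows the general setup of the earlier post from memory and is an assumption, not code copied from it.

import torch
from torch.distributions import Normal, Categorical

def select_action_continuous(mu, sigma):
    # Continuous case: the policy network outputs mu and sigma for each action dimension.
    dist = Normal(mu, sigma)
    action = dist.sample()                                   # one value per action dimension
    return action, dist.log_prob(action), dist.entropy()     # vectors of shape [a_dim]

def select_action_discrete(probs):
    # Discrete case: the policy network outputs a softmax distribution over n actions.
    dist = Categorical(probs=probs)
    action = dist.sample()                                   # a single integer action index
    return action, dist.log_prob(action), dist.entropy()     # scalars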

Also, vanilla PG is really unstable; the training curve swings about as much as the stock market, and performance depends heavily on hyperparameter tuning. One common mitigation is sketched below.
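A common way to tame that variance (not used in the script below, just a hedged suggestion) is to standardize the discounted returns within each episode before weighting the log-probabilities, for example:

import numpy as np
import torch

def normalized_returns(rewards, gamma, eps=1e-8):
    # G_t = r_t + gamma * G_{t+1}, computed backwards over the episode,
    # then standardized so the policy-gradient targets have zero mean and unit variance.
    returns, running = np.zeros(len(rewards), dtype=np.float32), 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    returns = torch.from_numpy(returns)
    return (returns - returns.mean()) / (returns.std() + eps)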

import argparse, math, os, sys
import numpy as np
import gym
from gym import wrappers
import matplotlib.pyplot as plt

import torch
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn.utils as utils
from torch.distributions import Categorical
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env_name', type=str, default='CartPole-v1')
parser.add_argument('--gamma', type=float, default=0.98, metavar='G',
                    help='discount factor for reward (default: 0.98)')
parser.add_argument('--seed', type=int, default=1234, metavar='N',            # random seed
                    help='random seed (default: 1234)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',       # max number of steps per episode
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=1000, metavar='N',    # number of training episodes
                    help='number of episodes (default: 1000)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',      # number of hidden units
                    help='number of hidden units (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100,
                    help='model saving frequency')
parser.add_argument('--display', action='store_true',                         # type=bool is an argparse pitfall; use a flag instead
                    help='record the environment with gym.wrappers.Monitor')
args = parser.parse_args()


env_name = args.env_name                                            # environment name
env = gym.make(env_name)                                            # create the environment
env = env.unwrapped
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n


if args.display:
    env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)

env.seed(args.seed)                                                 # set random seeds
torch.manual_seed(args.seed)                                        # Gym, NumPy and PyTorch all need their seeds set
np.random.seed(args.seed)


class Policy(nn.Module):                                            # policy parameterised by a neural network
    def __init__(self, hidden_size, s_dim, a_dim):
        super(Policy, self).__init__()

        self.linear1 = nn.Linear(s_dim, hidden_size)                # hidden layer
        self.linear2 = nn.Linear(hidden_size, a_dim)                # output layer: one logit per discrete action

    def forward(self, x):
        x = F.relu(self.linear1(x))
        p = F.softmax(self.linear2(x), dim=-1)                      # probability distribution over discrete actions
        return p

class REINFORCE:
    def __init__(self, hidden_size, s_dim, a_dim):
        self.model = Policy(hidden_size, s_dim, a_dim)                 # build the policy network
        # self.model = self.model.cuda()                               # GPU version
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-2)  # optimizer
        self.model.train()
        self.pi = Variable(torch.FloatTensor([math.pi])) # .cuda()     # pi constant, only needed by the continuous Gaussian version


    def select_action(self, state):
        # mu, sigma_sq = self.model(Variable(state).cuda())
        prob = self.model(Variable(state))                             # action probabilities, shape [1, a_dim]
        dist = Categorical(probs=prob)
        action = dist.sample()                                         # sample a discrete action index
        log_prob = prob[0, action.item()].log()                        # log-probability of the sampled action (scalar)
        # log_prob = prob.log()
        entropy = dist.entropy()                                       # entropy of the action distribution

        return action, log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma): # update the policy parameters
        R = torch.tensor(0.0)                                          # float, so the discounted return is not truncated
        loss = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]                                 # accumulate the discounted return backwards
            # loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - (0.0001*entropies[i].cuda()).sum()
            loss = loss - (log_probs[i]*Variable(R)) - 0.005*entropies[i][0]   # REINFORCE loss with a small entropy bonus

        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm_(self.model.parameters(), 2)              # gradient clipping, max L2 norm = 2
        self.optimizer.step()

agent = REINFORCE(args.hidden_size,s_dim,a_dim)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):    
    os.mkdir(dir)

log_reward = []
log_smooth = []
for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])
    entropies = []
    log_probs = []
    rewards = []
    for t in range(args.num_steps): # each episode runs for at most num_steps steps
        action, log_prob, entropy = agent.select_action(state)
        action = action.cpu()

        next_state, reward, done, _ = env.step(action.numpy()[0])

        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = torch.Tensor([next_state])

        if done:
            break
    # episode finished; update the policy
    agent.update_parameters(rewards, log_probs, entropies, args.gamma)

    if i_episode%args.ckpt_freq == 0:
        torch.save(agent.model.state_dict(), os.path.join(dir, 'reinforce-'+str(i_episode)+'.pkl'))

    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))
    
    log_reward.append(np.sum(rewards))
    if i_episode == 0:
        log_smooth.append(log_reward[-1])
    else:
        log_smooth.append(log_smooth[-1]*0.99+0.01*np.sum(rewards))
    
    plt.plot(log_reward)
    plt.plot(log_smooth)
    plt.pause(1e-5)
env.close()
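For completeness, here is a rough sketch (not part of the original script) of how one of the checkpoints saved above could be reloaded and evaluated with a greedy policy. The checkpoint file name assumes a finished 1000-episode run and is otherwise arbitrary.

# Rough sketch: load a saved checkpoint and run one greedy evaluation episode
# in a fresh environment. 'reinforce-900.pkl' is one of the files written above.
eval_env = gym.make(env_name)
eval_model = Policy(args.hidden_size, s_dim, a_dim)
eval_model.load_state_dict(torch.load(os.path.join(dir, 'reinforce-900.pkl')))
eval_model.eval()

state = torch.Tensor([eval_env.reset()])
total_reward, done, steps = 0.0, False, 0
while not done and steps < args.num_steps:
    with torch.no_grad():
        probs = eval_model(state)
    action = probs.argmax(dim=-1).item()        # greedy action instead of sampling
    next_state, reward, done, _ = eval_env.step(action)
    total_reward += reward
    state = torch.Tensor([next_state])
    steps += 1
print('evaluation reward:', total_reward)
eval_env.close()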