策略梯度Policy Gradient
本文介绍策略梯度算法Policy Gradient在连续动作空间上的实例。
以OpenAI Gym上的LunarLanderContinuous-v2游戏环境为例。(注意:下方代码中 --env_name 参数的默认值是 CartPole-v0,实际运行时需要通过命令行参数指定 LunarLanderContinuous-v2。)
原理与公式推导
略,先挖坑,后续补充。
代码
import argparse, math, os, sys
import numpy as np
import gym
from gym import wrappers
import torch
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn.utils as utils
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Command-line interface for the REINFORCE training script.
# Options are declared as (flag, keyword-args) pairs and registered in a loop;
# the resulting help text and defaults are identical to spelling out each
# parser.add_argument(...) call by hand.
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')

_CLI_OPTIONS = [
    # Gym environment id to train on.
    ('--env_name', dict(type=str, default='CartPole-v0')),
    # Reward discount factor.
    ('--gamma', dict(type=float, default=0.99, metavar='G',
                     help='discount factor for reward (default: 0.99)')),
    # How many episodes keep exploration noise active.
    ('--exploration_end', dict(type=int, default=100, metavar='N',
                               help='number of episodes with noise (default: 100)')),
    # RNG seed for reproducibility.
    ('--seed', dict(type=int, default=123, metavar='N',
                    help='random seed (default: 123)')),
    # Upper bound on frames per episode.
    ('--num_steps', dict(type=int, default=1000, metavar='N',
                         help='max episode length (default: 1000)')),
    # Total number of training episodes.
    ('--num_episodes', dict(type=int, default=2000, metavar='N',
                            help='number of episodes (default: 2000)')),
]

for _flag, _kwargs in _CLI_OPTIONS:
    parser.add_argument(_flag, **_kwargs)
parser.add_argument('--hidden_size',