强化学习经典算法笔记(九):LSTM加持的PolicyGradient算法
在上文《强化学习经典算法笔记(八):LSTM加持的A2C算法解决POMDP问题》的基础上,实现了LSTM+MLP的Policy Gradient算法。
实现过程如下:
# Imports for the REINFORCE (policy-gradient) training script:
# stdlib, NumPy, the Gym environment API, matplotlib for live plots, and PyTorch.
import argparse, math, os, sys
import numpy as np
import gym
from gym import wrappers
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable  # NOTE(review): Variable is deprecated in modern PyTorch; plain tensors suffice
import torch.autograd as autograd
import torch.nn.utils as utils
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
plt.ion()  # interactive mode: training curves redraw without blocking the run loop
# Command-line configuration for the REINFORCE training run.
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env_name', type=str, default='LunarLanderContinuous-v2')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--seed', type=int, default=123, metavar='N',  # RNG seed for reproducibility
                    help='random seed (default: 123)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',  # max frames per episode
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=2000, metavar='N',  # number of training episodes
                    help='number of episodes (default: 2000)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',  # hidden units per layer
                    # Fixed: help text previously said "number of episodes", copy-pasted
                    # from the argument above; it describes the network width.
                    help='number of hidden units (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int,<