PyTorch Implementation of the Policy Gradient Algorithm
Policy gradient is a typical on-policy learning method: the agent learns autonomously through interaction with the environment. The overall flow is roughly this: run the current policy in the environment, collect a complete episode, and then update the policy from that trajectory.
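For reference, the update that this kind of implementation approximates is the Monte Carlo policy gradient (REINFORCE): each sampled action is made more likely in proportion to the discounted return that followed it (the code below additionally normalizes the returns before using them as weights):

\nabla_\theta J(\theta) \approx \sum_{t} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, G_t, \qquad G_t = \sum_{k=t}^{T} \gamma^{\,k-t} r_k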
The agent only performs a learning step once the environment signals that the episode has finished (if the initial policy, which can also be called the initial random policy, keeps exploring the environment without ever reaching a terminal state, the agent will never get a chance to train). This is a serious drawback of the policy gradient algorithm: on more complex tasks, such as robotic-arm control or other more complicated games, the vanilla policy gradient algorithm hardly ever converges. The code below is a PyTorch implementation of policy gradient for the CartPole environment; the logic is fairly simple.
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter
MAX_EPISODE = 30000
RENDER = True
class DiscretePolicyGradient(nn.Module):
    def __init__(self, n_features, n_actions, num_hiddens=64):
        super(DiscretePolicyGradient, self).__init__()
        self.linear1 = nn.Linear(n_features, num_hiddens)
        self.linear2 = nn.Linear(num_hiddens, num_hiddens)
        self.output = nn.Linear(num_hiddens, n_actions)

    def forward(self, state):
        x = torch.tanh(self.linear1(state))
        x = torch.tanh(self.linear2(x))
        x = F.softmax(self.output(x), dim=1)
        return x
class ReplayBuffer:
    def __init__(self):
        self.replay_buffer = []
        self.count = 0

    def store_transition(self, s, a, r):
        # dtype=object because s is a vector while a and r are scalars
        self.replay_buffer.append(np.array([s, a, r], dtype=object))
        self.count += 1

    def get_observations(self):
        return np.vstack(np.vstack(self.replay_buffer)[:, 0])

    def get_actions(self):
        return np.vstack(np.vstack(self.replay_buffer)[:, 1])

    def clear(self):
        self.replay_buffer = []
        self.count = 0

    def get_reward(self, i):
        return np.vstack(self.replay_buffer)[i, 2]
class Agent:
    def __init__(self, n_features, n_actions, num_hiddens=64):
        self.n_features = n_features
        self.n_actions = n_actions
        self.model = DiscretePolicyGradient(n_features, n_actions, num_hiddens)
        self.opt = optim.Adam(self.model.parameters())
        self.replay_buffer = ReplayBuffer()
        self.gamma = 0.9
        self.writer = SummaryWriter("./torch_cartpole_log")

    # Choose an action given the current observation
    def choose_action(self, observation):
        if not isinstance(observation, torch.Tensor):
            observation = torch.FloatTensor(observation)
        if observation.dim() == 1:
            observation = observation.unsqueeze(0)
        # Disable gradient tracking; only a forward pass is needed here
        with torch.no_grad():
            action_prob = self.model(observation)
            c = Categorical(action_prob)
            action_index = c.sample()[0]
        return action_index
    # Perform one learning step over the collected episode
    def learn(self):
        discounted_reward_norm: np.ndarray = self.__discount_and_norm_rewards()
        observations = torch.FloatTensor(self.replay_buffer.get_observations())
        actions = torch.LongTensor(self.replay_buffer.get_actions())
        self.model.train()
        acts_prob = self.model(observations)
        # Note: backward() needs a scalar loss; the normalized discounted
        # returns act as per-sample weights on the cross entropy, which is
        # then reduced to a scalar by the default 'mean' reduction
        loss = F.binary_cross_entropy(
            acts_prob,
            F.one_hot(actions.squeeze(1), self.n_actions).type(torch.float32),
            torch.from_numpy(discounted_reward_norm).float().unsqueeze(1))
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        # After a learning step, clear all data in the replay_buffer
        self.replay_buffer.clear()
        return np.sum(discounted_reward_norm)
    # Discount the rewards along the episode and normalize them
    def __discount_and_norm_rewards(self):
        size = self.replay_buffer.count
        discounted_rewards = np.zeros(size)
        running_add = 0
        # Walk backwards through the episode to accumulate discounted returns
        for i in range(self.replay_buffer.count - 1, -1, -1):
            running_add = running_add * self.gamma + self.replay_buffer.get_reward(i)
            discounted_rewards[i] = running_add
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)
        return discounted_rewards
def run():
    # Old gym API (pre-0.26): env.seed(), reset() returns only obs, step() returns 4 values
    env = gym.make('CartPole-v0')
    env.seed(1)
    n_actions = env.action_space.n
    n_features = env.observation_space.shape[0]
    agent = Agent(n_features, n_actions)
    for episode in range(MAX_EPISODE):
        observation = env.reset()
        while True:
            if RENDER:
                env.render()
            obs_tensor = torch.FloatTensor(observation).unsqueeze(0)
            action = agent.choose_action(obs_tensor).cpu().numpy()
            next_observation, reward, done, info = env.step(action)
            agent.replay_buffer.store_transition(observation, action, reward)
            observation = next_observation
            # Learn only once the episode has terminated
            if done:
                agent.learn()
                break
    env.close()


if __name__ == '__main__':
    run()
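As a side note, the weighted binary cross entropy used in learn() above is just one way to obtain a scalar loss. A more common formulation of the same update multiplies the log-probability of each taken action by its normalized return. Below is a minimal sketch of what an alternative learn method in that style could look like; it is not part of the original code and assumes it replaces Agent.learn with the same attributes as above:

    # Alternative learn(): classic REINFORCE loss using log-probabilities directly
    def learn(self):
        returns = torch.from_numpy(self.__discount_and_norm_rewards()).float()
        observations = torch.FloatTensor(self.replay_buffer.get_observations())
        actions = torch.LongTensor(self.replay_buffer.get_actions()).squeeze(1)
        log_probs = Categorical(self.model(observations)).log_prob(actions)
        # Negative sign: gradient ascent on expected return via gradient descent on the loss
        loss = -(log_probs * returns).mean()
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        self.replay_buffer.clear()
        return returns.sum().item()

Either version reduces the per-step terms to a single scalar before calling backward(), which is what the comment about scalar gradients in the original learn() refers to.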