import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch import optim
class PolicyGradient(nn.Module):
    def __init__(self, n_actions, n_inputs, lr=0.01):
        super(PolicyGradient, self).__init__()
        self.fc1 = nn.Linear(n_inputs, 64)    # input dim is 4 for CartPole
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, n_actions)  # output dim is 2
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = 'cpu'
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        # PG pairs with a cross-entropy-style loss, so apply softmax along the
        # action dimension (dim=1) to get one probability distribution per row.
        actions = F.softmax(self.fc3(x), dim=1)
        return actions
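# A quick sanity check (hypothetical snippet, not part of the training loop):
# each row of the forward output should sum to 1.
#   net = PolicyGradient(n_actions=2, n_inputs=4)
#   probs = net(torch.zeros(1, 4))
#   assert torch.allclose(probs.sum(dim=1), torch.ones(1))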
class Agent:
    def __init__(self, env, n_actions, state_n_dim, gamma=0.99):
        self.env = env
        self.n_actions = n_actions
        self.state_n_dim = state_n_dim
        self.gamma = gamma  # discount factor for future rewards
        self.policy_net = PolicyGradient(self.n_actions, self.state_n_dim)  # the network to be learned
        self.scores = []  # total score of each episode
        self.loss = 0
        self.state_list, self.action_list, self.G_reward_list = [], [], []
    def choose_action(self, state):
        '''Sample the action from the softmax probabilities rather than from
        Q-values. Unlike DQN, no epsilon-greedy scheme is needed here, because
        sampling from the distribution is already stochastic.'''
        with torch.no_grad():
            probs = self.policy_net(torch.tensor(state, dtype=torch.float32).unsqueeze(0))
        # np.random.choice can raise "probabilities do not sum to 1" because of
        # float32 rounding, so cast to float64 and renormalize before sampling.
        probs = probs.squeeze(0).numpy().astype(np.float64)
        probs /= probs.sum()
        action = np.random.choice(self.n_actions, p=probs)  # returns an index
        return action
    def calc_reward_to_go(self, reward_list, gamma=None):
        '''Convert per-step rewards r_t into returns-to-go G_t, in place:
        G_t = r_t + gamma * G_{t+1}, computed backwards from the last step.'''
        if gamma is None:
            gamma = self.gamma
        for i in range(len(reward_list) - 2, -1, -1):
            reward_list[i] += gamma * reward_list[i + 1]
        return reward_list
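    # Worked example: with gamma = 0.9, rewards [1, 1, 1] become
    #   G_2 = 1
    #   G_1 = 1 + 0.9 * G_2 = 1.9
    #   G_0 = 1 + 0.9 * G_1 = 2.71
    # i.e. returns-to-go [2.71, 1.9, 1].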
    def learn(self):  # run one full episode, then update the policy
        done = False
        reward_list = []  # per-step rewards of this episode
        state = self.env.reset()  # e.g. [-0.00797301  0.02779841 -0.04731911  0.03995738]
        score = 0  # total score of this episode
        while not done:
            action = self.choose_action(state)  # sampled action index, e.g. 0
            next_state, reward, done, _ = self.env.step(action)
            score += reward
            self.state_list.append(state)
            self.action_list.append(action)
            reward_list.append(reward)
            state = next_state
        self.scores.append(score)
        self.G_reward_list = self.calc_reward_to_go(reward_list)

        self.policy_net.optimizer.zero_grad()
        state_batch = torch.tensor(np.array(self.state_list), dtype=torch.float32)
        action_batch = torch.tensor(self.action_list)  # e.g. tensor([0, 1, 1, 1, 0, ...])
        reward_batch = torch.tensor(self.G_reward_list)
        pred_list = self.policy_net(state_batch)  # action probabilities, shape (T, n_actions)
        # F.cross_entropy(pred, target) expects raw logits, but pred_list has
        # already been through softmax, so take the log and use F.nll_loss(),
        # which also applies the minus sign for us. (log_softmax on raw logits
        # would be numerically safer than log-of-softmax.)
        loss = F.nll_loss(pred_list.log(), action_batch, reduction='none')  # one loss value per step
        # Weight each step's negative log-probability by its return-to-go, then average.
        loss = torch.mean(loss * reward_batch)
        self.loss = loss.item()
        loss.backward()  # backpropagate the gradients
        self.policy_net.optimizer.step()  # update the parameters
        self.state_list, self.action_list, self.G_reward_list = [], [], []  # clear the trajectory
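    # Equivalence check (hypothetical snippet): for raw logits,
    # F.cross_entropy(logits, target) matches
    # F.nll_loss(F.log_softmax(logits, dim=1), target).
    #   logits = torch.randn(3, 2)
    #   target = torch.tensor([0, 1, 0])
    #   assert torch.allclose(F.cross_entropy(logits, target),
    #                         F.nll_loss(F.log_softmax(logits, dim=1), target))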
    def save_model(self):
        torch.save(self.policy_net.state_dict(), 'pg_net.pth')  # save the model parameters
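# To reload the saved weights later (a minimal sketch, assuming the same
# CartPole dimensions as in the training run below):
#   net = PolicyGradient(n_actions=2, n_inputs=4)
#   net.load_state_dict(torch.load('pg_net.pth'))
#   net.eval()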
if __name__ == '__main__':
    round_count = 5  # run 5 rounds and average the results
    round_all_score = 0
    env = gym.make('CartPole-v0')
    n_actions = env.action_space.n  # 2 possible actions
    state_n_dims = env.observation_space.shape[0]  # the state has 4 dimensions
    for i in range(round_count):
        agent = Agent(env, n_actions, state_n_dims)
        episodes = 1000  # 1000 episodes per round
        for episode in range(episodes):
            agent.learn()  # run one episode and update from its trajectory
            print('Episode: ', episode, '| reward: ', agent.scores[episode])
        avg_score = np.mean(agent.scores)  # average score over the 1000 episodes
        print('Round: ', i, '| Average score: ', int(avg_score))
        round_all_score += avg_score
    agent.env.close()
    print('run', round_count, 'rounds, the average score is:', int(round_all_score / round_count))
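# Note: this script assumes the pre-0.26 gym API, where env.reset() returns
# just the observation and env.step() returns a 4-tuple; with gym >= 0.26 or
# gymnasium, reset() returns (obs, info) and step() returns a 5-tuple
# (obs, reward, terminated, truncated, info).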
There may still be a few small issues; corrections are welcome.