Question: train LunarLander-v2 with the PPO algorithm and show the results.
Code
Environment requirements: requirements.txt
python >= 3.8
absl-py==2.1.0
ale-py==0.8.1
asttokens==2.4.1
atomicwrites==1.4.1
attrs==24.2.0
backcall==0.2.0
box2d-py==2.3.5
certifi==2024.7.4
cffi==1.17.0
charset-normalizer==3.3.2
cloudpickle==3.0.0
colorama==0.4.6
contourpy==1.1.1
cycler==0.12.1
Cython==3.0.0a10
decorator==4.4.2
executing==2.1.0
fasteners==0.19
fonttools==4.53.1
glfw==2.7.0
gym==0.26.0
gym_examples==0.0.1
gym-notices==0.0.8
Gymnasium==0.26.3
gymnasium-notices==0.0.1
idna==3.7
imageio==2.35.1
imageio-ffmpeg==0.5.1
importlib_metadata==8.4.0
importlib_resources==6.4.4
iniconfig==2.0.0
ipython==8.12.3
jedi==0.19.1
kiwisolver==1.4.5
lz4==4.3.3
matplotlib==3.7.5
matplotlib-inline==0.1.7
moviepy==1.0.3
mujoco==2.2.0
mujoco-py==2.1.2.14
numpy==1.24.4
opencv-python==4.10.0.84
packaging==24.1
pandas==2.0.3
parso==0.8.4
pickleshare==0.7.5
pillow==10.4.0
pip==24.2
pluggy==1.5.0
proglog==0.1.10
prompt_toolkit==3.0.47
pure_eval==0.2.3
py==1.11.0
pycparser==2.22
pygame==2.1.0
Pygments==2.18.0
PyOpenGL==3.1.7
pyparsing==3.1.2
pytest==7.0.1
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.32.3
seaborn==0.13.2
setuptools==72.1.0
six==1.16.0
stack-data==0.6.3
swig==4.2.1
tomli==2.0.1
torch==1.12.1+cu113
torchaudio==0.12.1+cu113
torchvision==0.13.1+cu113
tqdm==4.66.5
traitlets==5.14.3
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
wcwidth==0.2.13
wheel==0.43.0
zipp==3.20.0
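LunarLander-v2 is a Box2D environment, so swig and box2d-py from the list above must be importable before gym.make will succeed. A quick sanity check (my addition, not part of the original scripts) that the pinned versions work:

import gym

# Creating the environment fails with a Box2D import error if box2d-py/swig are missing.
env = gym.make('LunarLander-v2')
obs, info = env.reset()             # gym 0.26 reset() returns (observation, info)
print(obs.shape, env.action_space)  # expected: (8,) Discrete(4)
env.close()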
# PPO with the clipped surrogate objective, for environments with discrete actions
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
import gym
from tqdm import tqdm
import os
# ----------------------------------- #
# Policy network -- actor
# ----------------------------------- #
class PolicyNet(nn.Module):
    def __init__(self, n_states, n_hiddens, n_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(n_states, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, n_hiddens)
        self.fc3 = nn.Linear(n_hiddens, n_actions)
    def forward(self, x):
        x = self.fc1(x)          # [b, n_states] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc2(x)          # [b, n_hiddens] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc3(x)          # [b, n_hiddens] --> [b, n_actions]
        x = F.softmax(x, dim=1)  # action probabilities for each state in the batch
        return x
# ----------------------------------- #
# Value network -- critic
# ----------------------------------- #
class ValueNet(nn.Module):
    def __init__(self, n_states, n_hiddens):
        super(ValueNet, self).__init__()
        self.fc1 = nn.Linear(n_states, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, n_hiddens)
        self.fc3 = nn.Linear(n_hiddens, 1)
    def forward(self, x):
        x = self.fc1(x)   # [b, n_states] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc2(x)   # [b, n_hiddens] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc3(x)   # [b, n_hiddens] --> [b, 1], the state value
        return x
# ----------------------------------- #
# PPO agent
# ----------------------------------- #
class PPO:
    def __init__(self, n_states, n_hiddens, n_actions, actor_lr, critic_lr, lmbda, epochs, eps, gamma, device):
        # instantiate the policy network
        self.actor = PolicyNet(n_states, n_hiddens, n_actions).to(device)
        # instantiate the value network
        self.critic = ValueNet(n_states, n_hiddens).to(device)
        # optimizer for the policy network
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr, weight_decay=0.0005)
        # optimizer for the value network
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=0.0005)
        self.gamma = gamma    # discount factor
        self.lmbda = lmbda    # GAE decay coefficient (lambda)
        self.epochs = epochs  # number of optimization epochs per collected episode
        self.eps = eps        # PPO clipping range
        self.device = device
    # action selection
    def take_action(self, state):
        # reshape [n_states] --> tensor [1, n_states]
        state = torch.tensor(state[np.newaxis, :], dtype=torch.float).to(self.device)
        # action probabilities for the current state, shape [1, n_actions]
        probs = self.actor(state)
        # build a categorical distribution from probs
        action_list = torch.distributions.Categorical(probs)
        # sample an action according to its probability
        action = action_list.sample().item()
        return action
    # save the actor and critic weights
    def save_weight(self, route_actor, route_critic, episode, rewards):
        torch.save(self.actor.state_dict(),
                   os.path.join(route_actor, 'actor_net_{}_{}.pt'.format(episode, rewards)))
        torch.save(self.critic.state_dict(),
                   os.path.join(route_critic, 'critic_net_{}_{}.pt'.format(episode, rewards)))
    # training update
    def learn(self, transition_dict):
        # unpack the collected episode
        states = torch.tensor(np.array(transition_dict['states']), dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).to(self.device).view(-1, 1)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).to(self.device).view(-1, 1)
        next_states = torch.tensor(np.array(transition_dict['next_states']), dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).to(self.device).view(-1, 1)
        # state value of the next state [b, 1]
        next_q_target = self.critic(next_states)
        # TD target for the current state [b, 1]
        td_target = rewards + self.gamma * next_q_target * (1 - dones)
        # predicted state value of the current state [b, 1]
        td_value = self.critic(states)
        # TD error: target minus prediction [b, 1]
        td_delta = td_target - td_value
        # tensor --> numpy [b, 1]
        td_delta = td_delta.cpu().detach().numpy()
        advantage = 0        # running GAE value
        advantage_list = []
        # compute the GAE advantages, iterating over the TD errors in reverse time order
        for delta in td_delta[::-1]:
            # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
            advantage = self.gamma * self.lmbda * advantage + delta
            advantage_list.append(advantage)
        # back to chronological order
        advantage_list.reverse()
        # numpy --> tensor [b, 1]
        advantage = torch.tensor(np.array(advantage_list), dtype=torch.float).to(self.device)
        # log-probability of each taken action under the old (pre-update) policy
        old_log_probs = torch.log(self.actor(states).gather(1, actions)).detach()
        # run several optimization epochs on the same batch
        for _ in range(self.epochs):
            # log-probability of each taken action under the current policy
            log_probs = torch.log(self.actor(states).gather(1, actions))
            # probability ratio between the new and old policies
            ratio = torch.exp(log_probs - old_log_probs)
            # unclipped term of the clipped surrogate objective
            surr1 = ratio * advantage
            # clipped term: the ratio is clamped to [1 - eps, 1 + eps]
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            # policy loss (negative clipped surrogate objective)
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            # value loss: MSE between the predicted state value and the TD target
            critic_loss = torch.mean(F.mse_loss(self.critic(states), td_target.detach()))
            # zero the gradients
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            # backpropagation
            actor_loss.backward()
            critic_loss.backward()
            # gradient step
            self.actor_optimizer.step()
            self.critic_optimizer.step()
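# For reference only: the reversed loop inside learn() implements the GAE recursion
#     A_t = delta_t + gamma * lambda * A_{t+1}
# The standalone helper below is an equivalent sketch and is NOT called by this script.
# It assumes td_delta is a numpy array of shape [b, 1] and returns advantages of the
# same shape. Because learn() receives one episode at a time, no done-masking is needed
# inside the recursion.
def compute_gae(td_delta, gamma, lmbda):
    advantages = np.zeros_like(td_delta)
    running = 0.0
    for t in reversed(range(len(td_delta))):   # walk backwards through time
        running = td_delta[t] + gamma * lmbda * running
        advantages[t] = running
    return advantages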
# evaluate the current policy: run `iter` episodes and report success if the mean return exceeds 200
def test_and_save(agent_test, iter):
    env_test = gym.make('LunarLander-v2', max_episode_steps=600)  # evaluation environment
    return_list = []
    for _ in range(iter):               # run `iter` evaluation episodes and average the returns
        state = env_test.reset()[0]     # reset the environment
        returns = 0
        while True:                     # one episode; each loop iteration is one step
            action = agent_test.take_action(state)                      # choose an action for the current state
            s_, r, terminated, truncated, info = env_test.step(action)  # take the action and observe the outcome
            returns += r
            state = s_                  # move to the next state
            done = terminated or truncated
            if done:                    # episode finished
                return_list.append(returns)
                break
    returns_mean = np.mean(return_list)
    returns_std = np.std(return_list)
    if returns_mean > 200:
        return True, returns_mean
    else:
        return False, -1
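# Note: test_and_save is defined above but never called in the training loop below.
# A hypothetical way to wire it in would be to evaluate every 100 episodes and stop
# once the LunarLander "solved" threshold of a 200 mean return is reached, e.g.:
#     if (i + 1) % 100 == 0:
#         solved, eval_return = test_and_save(agent, 10)
#         if solved:
#             agent.save_weight(route_actor, route_critic, i, eval_return)
#             break
# This sketch is not part of the original script; it only shows one possible use of the helper.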
###### ----------------------------------- #########
###### Training
###### ----------------------------------- #########
# device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')
# ----------------------------------------- #
# Hyperparameters
# ----------------------------------------- #
num_episodes = 20000         # total number of training episodes
gamma = 0.99                 # discount factor
actor_lr = 0.0003            # learning rate of the policy network
critic_lr = 0.001            # learning rate of the value network
n_hiddens = 64               # number of hidden units per layer
env_name = 'LunarLander-v2'  # environment name
return_list = []             # stores the return of every episode
lmbda = 0.95                 # GAE lambda
epochs = 10                  # optimization epochs per collected episode
eps = 0.2                    # PPO clipping range
# ----------------------------------------- #
# Environment
# ----------------------------------------- #
env = gym.make(env_name, max_episode_steps=1000)
n_states = env.observation_space.shape[0]  # state dimension
n_actions = env.action_space.n             # number of discrete actions
# ----------------------------------------- #
# Agent
# ----------------------------------------- #
agent = PPO(n_states=n_states,
            n_hiddens=n_hiddens,
            n_actions=n_actions,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            lmbda=lmbda,
            epochs=epochs,
            eps=eps,
            gamma=gamma,
            device=device
            )
max_mean_return = -100000.0
route_actor = './Actor_weights'
route_critic = './Critic_weights'
os.makedirs(route_actor, exist_ok=True)   # make sure the weight directories exist before saving
os.makedirs(route_critic, exist_ok=True)
# ----------------------------------------- #
# Training loop -- one on-policy update per episode
# ----------------------------------------- #
for i in tqdm(range(num_episodes)):
    state = env.reset()[0]   # reset the environment
    done = False             # episode-finished flag
    episode_return = 0       # cumulative reward of this episode
    # buffer that stores the transitions of the current episode
    transition_dict = {
        'states': [],
        'actions': [],
        'next_states': [],
        'rewards': [],
        'dones': [],
    }
    while not done:
        action = agent.take_action(state)  # select an action
        next_state, reward, terminated, truncated, _ = env.step(action)  # step the environment
        done = terminated or truncated
        # store the transition (state, action, next state, reward, termination flag)
        transition_dict['states'].append(state)
        transition_dict['actions'].append(action)
        transition_dict['next_states'].append(next_state)
        transition_dict['rewards'].append(reward)
        # only `terminated` is stored, so truncated episodes are still bootstrapped in learn()
        transition_dict['dones'].append(terminated)
        # move to the next state
        state = next_state
        # accumulate the episode reward
        episode_return += reward
    # store the return of this episode
    return_list.append(episode_return)
    # update the agent on the collected episode
    agent.learn(transition_dict)
    # mean return over the last 10 episodes
    return_last_10 = np.mean(return_list[-10:])
    if (i + 1) % 100 == 0:  # every 100 episodes, keep the weights if the running mean improved
        if return_last_10 > max_mean_return:
            max_mean_return = return_last_10
            agent.save_weight(route_actor, route_critic, i, max_mean_return)
        # print progress
        print(f'iter:{i}, mean return:{return_last_10}, the newest return: {episode_return}')
# -------------------------------------- #
# Plotting
# -------------------------------------- #
# plt.plot(return_list)
# plt.title('return')
# plt.show()
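The per-episode return curve is quite noisy, so if you enable the plotting block above it can help to also draw a moving average. A small sketch (my addition; the 50-episode window is an arbitrary choice) that uses only numpy and matplotlib from the requirements:

import numpy as np
import matplotlib.pyplot as plt

window = 50  # smoothing window, a hypothetical choice
smoothed = np.convolve(return_list, np.ones(window) / window, mode='valid')
plt.plot(return_list, alpha=0.3, label='episode return')
plt.plot(np.arange(window - 1, len(return_list)), smoothed, label='50-episode moving average')
plt.xlabel('episode')
plt.ylabel('return')
plt.legend()
plt.show()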
Results
The hyperparameters were tuned by hand from experience, which took quite a bit of time. After several training runs I obtained a set of weights with fairly good performance (the weights are not provided here; train your own). The mean and standard deviation over 100 evaluation episodes are listed after the test code below.
Test code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gymnasium as gym
from tqdm import tqdm
import os
# ----------------------------------- #
# Policy network -- actor
# ----------------------------------- #
class PolicyNet(nn.Module):
    def __init__(self, n_states, n_hiddens, n_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(n_states, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, n_hiddens)
        self.fc3 = nn.Linear(n_hiddens, n_actions)
    def forward(self, x):
        x = self.fc1(x)          # [b, n_states] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc2(x)          # [b, n_hiddens] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc3(x)          # [b, n_hiddens] --> [b, n_actions]
        x = F.softmax(x, dim=1)  # action probabilities for each state in the batch
        return x
# ----------------------------------- #
# Value network -- critic
# ----------------------------------- #
class ValueNet(nn.Module):
    def __init__(self, n_states, n_hiddens):
        super(ValueNet, self).__init__()
        self.fc1 = nn.Linear(n_states, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, n_hiddens)
        self.fc3 = nn.Linear(n_hiddens, 1)
    def forward(self, x):
        x = self.fc1(x)   # [b, n_states] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc2(x)   # [b, n_hiddens] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc3(x)   # [b, n_hiddens] --> [b, 1], the state value
        return x
# ----------------------------------- #
# PPO agent
# ----------------------------------- #
class PPO:
    def __init__(self, n_states, n_hiddens, n_actions, actor_lr, critic_lr, lmbda, epochs, eps, gamma, device):
        # instantiate the policy network
        self.actor = PolicyNet(n_states, n_hiddens, n_actions).to(device)
        # instantiate the value network
        self.critic = ValueNet(n_states, n_hiddens).to(device)
        # optimizer for the policy network
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr, weight_decay=0.0005)
        # optimizer for the value network
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=0.0005)
        self.gamma = gamma    # discount factor
        self.lmbda = lmbda    # GAE decay coefficient (lambda)
        self.epochs = epochs  # number of optimization epochs per collected episode
        self.eps = eps        # PPO clipping range
        self.device = device
    # action selection
    def take_action(self, state):
        # reshape [n_states] --> tensor [1, n_states]
        state = torch.tensor(state[np.newaxis, :], dtype=torch.float).to(self.device)
        # action probabilities for the current state, shape [1, n_actions]
        probs = self.actor(state)
        # build a categorical distribution from probs
        action_list = torch.distributions.Categorical(probs)
        # sample an action according to its probability
        action = action_list.sample().item()
        return action
    # save the actor and critic weights
    def save_weight(self, route_actor, route_critic, episode, rewards):
        torch.save(self.actor.state_dict(),
                   os.path.join(route_actor, 'actor_net_{}_{}.pt'.format(episode, rewards)))
        torch.save(self.critic.state_dict(),
                   os.path.join(route_critic, 'critic_net_{}_{}.pt'.format(episode, rewards)))
    # training update (not used during evaluation, kept for completeness)
    def learn(self, transition_dict):
        # unpack the collected episode
        states = torch.tensor(np.array(transition_dict['states']), dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).to(self.device).view(-1, 1)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).to(self.device).view(-1, 1)
        next_states = torch.tensor(np.array(transition_dict['next_states']), dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).to(self.device).view(-1, 1)
        # state value of the next state [b, 1]
        next_q_target = self.critic(next_states)
        # TD target for the current state [b, 1]
        td_target = rewards + self.gamma * next_q_target * (1 - dones)
        # predicted state value of the current state [b, 1]
        td_value = self.critic(states)
        # TD error: target minus prediction [b, 1]
        td_delta = td_target - td_value
        # tensor --> numpy [b, 1]
        td_delta = td_delta.cpu().detach().numpy()
        advantage = 0        # running GAE value
        advantage_list = []
        # compute the GAE advantages, iterating over the TD errors in reverse time order
        for delta in td_delta[::-1]:
            # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}
            advantage = self.gamma * self.lmbda * advantage + delta
            advantage_list.append(advantage)
        # back to chronological order
        advantage_list.reverse()
        # numpy --> tensor [b, 1]
        advantage = torch.tensor(np.array(advantage_list), dtype=torch.float).to(self.device)
        # log-probability of each taken action under the old (pre-update) policy
        old_log_probs = torch.log(self.actor(states).gather(1, actions)).detach()
        # run several optimization epochs on the same batch
        for _ in range(self.epochs):
            # log-probability of each taken action under the current policy
            log_probs = torch.log(self.actor(states).gather(1, actions))
            # probability ratio between the new and old policies
            ratio = torch.exp(log_probs - old_log_probs)
            # unclipped term of the clipped surrogate objective
            surr1 = ratio * advantage
            # clipped term: the ratio is clamped to [1 - eps, 1 + eps]
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            # policy loss (negative clipped surrogate objective)
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            # value loss: MSE between the predicted state value and the TD target
            critic_loss = torch.mean(F.mse_loss(self.critic(states), td_target.detach()))
            # zero the gradients
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            # backpropagation
            actor_loss.backward()
            critic_loss.backward()
            # gradient step
            self.actor_optimizer.step()
            self.critic_optimizer.step()
###### ----------------------------------- #########
###### Evaluation
###### ----------------------------------- #########
# device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')
# ----------------------------------------- #
# Hyperparameters (the network sizes must match the training run)
# ----------------------------------------- #
num_episodes = 20000         # total number of training episodes (unused here)
gamma = 0.99                 # discount factor
actor_lr = 0.0003            # learning rate of the policy network
critic_lr = 0.001            # learning rate of the value network
n_hiddens = 64               # number of hidden units per layer
env_name = 'LunarLander-v2'  # environment name
return_list = []             # stores the return of every evaluation episode
lmbda = 0.95                 # GAE lambda
epochs = 10                  # optimization epochs per collected episode
eps = 0.2                    # PPO clipping range
# ----------------------------------------- #
# Environment
# ----------------------------------------- #
env = gym.make(env_name, max_episode_steps=1000)
n_states = env.observation_space.shape[0]  # state dimension
n_actions = env.action_space.n             # number of discrete actions
# ----------------------------------------- #
# Agent
# ----------------------------------------- #
agent = PPO(n_states=n_states,
            n_hiddens=n_hiddens,
            n_actions=n_actions,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            lmbda=lmbda,
            epochs=epochs,
            eps=eps,
            gamma=gamma,
            device=device
            )
env = gym.make('LunarLander-v2', max_episode_steps=600)  # evaluation environment
agent.actor.load_state_dict(torch.load('./Actor_weights/actor_net_11600_265.37872430028006.pt'))
agent.critic.load_state_dict(torch.load('./Critic_weights/critic_net_11600_265.37872430028006.pt'))
return_list = []   # returns of each evaluation episode
for i in range(100):
    state = env.reset()[0]   # reset the environment
    returns = 0
    while True:              # one episode; each loop iteration is one step
        action = agent.take_action(state)                      # choose an action for the current state
        s_, r, terminated, truncated, info = env.step(action)  # take the action and observe the outcome
        returns += r
        state = s_           # move to the next state
        done = terminated or truncated
        if done:             # episode finished
            print("{} itr, returns: {}".format(i+1, returns))
            return_list.append(returns)
            break
returns_mean = np.mean(return_list)
returns_std = np.std(return_list)
print("The mean returns : {} +/- {}".format(returns_mean, returns_std))
env.close()
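Since the question also asks to show the results, one way to produce a visual demonstration is to record an evaluation rollout as a video. The sketch below is my addition (not part of the original test script); it assumes the trained agent from above and uses imageio together with imageio-ffmpeg from the requirements:

import imageio
import gymnasium as gym

env = gym.make('LunarLander-v2', render_mode='rgb_array', max_episode_steps=600)
frames = []
state = env.reset()[0]
while True:
    frames.append(env.render())   # one RGB frame per step
    action = agent.take_action(state)
    state, r, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        break
env.close()
imageio.mimsave('lunarlander_ppo.mp4', frames, fps=30)  # write the rollout as an mp4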
Mean and standard deviation over 100 evaluation episodes:
1 itr, returns: 278.4110081221079
2 itr, returns: 234.14997884039028
3 itr, returns: 257.5893326395186
4 itr, returns: 284.75276394511627
5 itr, returns: 290.3131784551
6 itr, returns: 261.1476744279624
7 itr, returns: 230.69724689787256
8 itr, returns: 256.9773008013037
9 itr, returns: 244.26628558218982
10 itr, returns: 292.2101416599869
11 itr, returns: 259.5525937642354
12 itr, returns: 243.17087967265547
13 itr, returns: 263.8951475809203
14 itr, returns: 244.5178880782347
15 itr, returns: 248.1148973157124
16 itr, returns: 265.11365427390405
17 itr, returns: 285.53578730283505
18 itr, returns: 280.0755086722593
19 itr, returns: 306.3884083781254
20 itr, returns: 264.21717269513255
21 itr, returns: 221.882426416192
22 itr, returns: 265.1480876911288
23 itr, returns: 273.64825193535063
24 itr, returns: 290.3195267240245
25 itr, returns: 263.278184656814
26 itr, returns: 286.8899645342925
27 itr, returns: 293.318089995125
28 itr, returns: 275.2151624268086
29 itr, returns: 267.1210722068299
30 itr, returns: 265.03313380206066
31 itr, returns: 270.0987776695826
32 itr, returns: 261.2106187177669
33 itr, returns: 252.20351866607362
34 itr, returns: 246.20261775384256
35 itr, returns: 299.000414234853
36 itr, returns: 236.23517131389363
37 itr, returns: 290.72150000713333
38 itr, returns: 249.48753909472134
39 itr, returns: 256.37399316151635
40 itr, returns: 270.2502115550187
41 itr, returns: 232.3121514486511
42 itr, returns: 249.12939369529659
43 itr, returns: 255.9020792836925
44 itr, returns: 301.7658919552199
45 itr, returns: 255.4604937837166
46 itr, returns: 273.10367687731593
47 itr, returns: 276.6675210903388
48 itr, returns: 261.1863571035764
49 itr, returns: 273.0257258124115
50 itr, returns: 255.68778912255243
51 itr, returns: 261.3278435028139
52 itr, returns: 283.85736835896796
53 itr, returns: 275.8134497969644
54 itr, returns: 280.65977835330614
55 itr, returns: 262.0105692888053
56 itr, returns: 217.9409993393779
57 itr, returns: 265.5830485825259
58 itr, returns: 244.99977855092024
59 itr, returns: 252.23072799239526
60 itr, returns: 294.54698649859927
61 itr, returns: 278.9200241514112
62 itr, returns: 277.63202308356034
63 itr, returns: 244.8576790916366
64 itr, returns: 245.7964471302949
65 itr, returns: 300.9317370094189
66 itr, returns: 291.3273634068473
67 itr, returns: 276.2324445409035
68 itr, returns: 264.6959275422398
69 itr, returns: 234.35206240850505
70 itr, returns: 298.66508594528597
71 itr, returns: 312.02552328898594
72 itr, returns: 228.56759308210505
73 itr, returns: 259.61512585431257
74 itr, returns: 262.01121496316165
75 itr, returns: 277.6988579497752
76 itr, returns: 277.54242748761635
77 itr, returns: 298.4727125389718
78 itr, returns: 284.5508287557583
79 itr, returns: 244.7304846531017
80 itr, returns: 252.14952293847324
81 itr, returns: 244.17955129929766
82 itr, returns: 299.27898386780146
83 itr, returns: 251.77999640720213
84 itr, returns: 287.5474819586275
85 itr, returns: 286.06851592289263
86 itr, returns: 270.3006541088115
87 itr, returns: 243.48406723314622
88 itr, returns: 277.530599155113
89 itr, returns: 242.18011210897922
90 itr, returns: 284.6848996497762
91 itr, returns: 252.71576396487472
92 itr, returns: 266.5619299108769
93 itr, returns: 275.77132211572405
94 itr, returns: 290.64491938845566
95 itr, returns: 272.1502263845723
96 itr, returns: 259.3298487674714
97 itr, returns: 245.75714666078747
98 itr, returns: 256.8684813412717
99 itr, returns: 272.242801240159
100 itr, returns: 244.93767826530046
The mean returns : 266.3273680767955 +/- 20.15884602860312