This is basically the most vanilla DDPG algorithm with a slightly modified reward scheme (without that change it genuinely will not converge). It converges after roughly 200 episodes (I break out of an episode if the car still has not reached the goal after 1000 actions).
First we need to understand what the state contains: state[0] is the mountain car's position, which starts around -0.5; state[1] is its velocity, negative meaning it is moving left and positive meaning it is moving right. The reward can be designed from these two values.
1. The first reward design is based on state[0]: reward = abs(state[0] + 0.5), meaning the farther the car gets from its starting point, the higher the reward. In practice the results are mediocre.
2. The second reward design is based on state[1]: reward = abs(state[1]), meaning the higher the speed, the higher the reward. This works reasonably well early on, but later the car rarely reaches the goal (presumably because the agent decides that not finishing at the goal lets it keep collecting reward).
3. The third reward design is also based on state[1], with a small change of perspective: reward = abs(state[1]) - 2. Because abs(state[1]) is always far smaller than 2 (the speed is capped well below 1), every step contributes a negative reward, so the longer an episode drags on, the lower total_reward becomes. Once the agent has reached the goal for the first time, it therefore prefers to keep heading there, which fixes the flaw of the second method.
Besides these three, there are plenty of other reward designs (for example combining state[0] and state[1]) that I will leave for you to explore; the three variants above are sketched right after this list. Overall, the third method is the easiest to get to converge among the ones I have found so far.
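For reference, here is a minimal sketch of the three reward variants as plain Python functions. The names reward_v1/v2/v3 are my own, purely for illustration; the training script below simply inlines the third formula.

# Hypothetical helper names for illustration; the script below inlines abs(state[1]) - 2.
def reward_v1(state):
    # distance from the starting point (about -0.5): farther away => more reward
    return abs(state[0] + 0.5)

def reward_v2(state):
    # reward speed in either direction; the agent may learn to avoid ending the episode
    return abs(state[1])

def reward_v3(state):
    # speed bonus minus a constant penalty, so every extra step lowers total_reward
    return abs(state[1]) - 2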
As for the DDPG part itself, it is no different from standard DDPG, so I will not go over it again here; if you are interested, look up other resources.
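For readers who only want the gist, these are the two updates that Agent.learn in the code below performs (my own compact summary in standard DDPG notation, not something the original post spells out):

\[
y = r + \gamma (1 - d)\, Q'\big(s', \mu'(s')\big), \qquad
L_{\mathrm{critic}} = \big(Q(s,a) - y\big)^2, \qquad
L_{\mathrm{actor}} = -\,Q\big(s, \mu(s)\big), \qquad
\theta' \leftarrow \tau \theta + (1-\tau)\,\theta'
\]

Here d is the terminated flag, mu/Q are the online actor/critic, mu'/Q' their target copies, and tau corresponds to Tau in the code.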
# deterministic policy gradient
import gymnasium as gym
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
from collections import deque

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Env = 'MountainCarContinuous-v0'
CIN = 2                  # state dimension (position, velocity)
COUT = 1                 # action dimension
Action_range = [[-1,1]]  # action range
Episodes = 1*10**4
Pool_size = 1*10**4      # replay buffer capacity
Pool_sample = 64         # batch size
Hidden = 512
Gamma = 0.99
Policy_LR = 3e-4
Value_LR = 3e-3
Tau = 0.005              # soft update coefficient
Sigma = 0.1              # exploration noise scale
class Policy_Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(CIN, Hidden),
            nn.ReLU(),
            nn.Linear(Hidden, Hidden),
            nn.ReLU(),
            nn.Linear(Hidden, COUT),
            nn.Tanh()  # squash the action into [-1, 1]
        )
    def forward(self, x):
        return self.seq(x)
class Value_Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(CIN + COUT, Hidden),  # takes the concatenated state-action pair
            nn.ReLU(),
            nn.Linear(Hidden, Hidden),
            nn.ReLU(),
            nn.Linear(Hidden, 1)
        )
    def forward(self, s, a):
        x = torch.cat([s, a], dim=1)
        return self.seq(x)
class Pool:  # experience replay buffer
    def __init__(self):
        self.pool = {
            'state': deque([]),
            'action': deque([]),
            'reward': deque([]),
            'next_state': deque([]),
            'terminated': deque([])
        }
        self.cnt = 0
    def append(self, state, action, reward, next_state, terminated):
        self.cnt += 1
        for i, j in zip(self.pool.keys(), [state, action, reward, next_state, terminated]):
            self.pool[i].append(j)
        if len(self) > Pool_size:  # drop the oldest transition once the buffer is full
            for i in self.pool.keys():
                self.pool[i].popleft()
    def sample(self):
        # sample Pool_sample transitions uniformly (with replacement)
        s0, a0, r1, s1, te = [], [], [], [], []
        for _ in range(Pool_sample):
            rand = random.randint(0, len(self) - 1)
            for i, j in zip(self.pool.keys(), [s0, a0, r1, s1, te]):
                j.append(self.pool[i][rand])
        return (torch.tensor(i).float().view(Pool_sample, -1).to(device) for i in (s0, a0, r1, s1, te))
    def __len__(self):
        return len(self.pool['state'])
class Agent:  # the DDPG agent
    def __init__(self):
        self.critic = Value_Net().to(device)
        self.actor = Policy_Net().to(device)
        self.tar_critic = Value_Net().to(device)
        self.tar_actor = Policy_Net().to(device)
        # start the target networks from the same weights as the online networks
        self.tar_critic.load_state_dict(self.critic.state_dict())
        self.tar_actor.load_state_dict(self.actor.state_dict())
        self.optim_critic = torch.optim.Adam(self.critic.parameters(), lr=Value_LR)
        self.optim_actor = torch.optim.Adam(self.actor.parameters(), lr=Policy_LR)
    def learn(self, pool):
        s0, a0, r1, s1, te = pool.sample()
        # critic: regress Q(s0,a0) toward the TD target r1 + Gamma * Q'(s1, mu'(s1))
        td_now = self.critic(s0, a0)
        td_tar = r1 + Gamma * self.tar_critic(s1, self.tar_actor(s1)) * (1 - te)
        loss = nn.functional.mse_loss(td_now, td_tar.detach())
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()
        # actor: maximize Q(s0, mu(s0))
        loss = -self.critic(s0, self.actor(s0)).mean()
        self.optim_actor.zero_grad()
        loss.backward()
        self.optim_actor.step()
        self.soft_update()
    def soft_update(self):
        # target <- (1 - Tau) * target + Tau * online
        for tar, use in zip(self.tar_critic.parameters(), self.critic.parameters()):
            tar.data.copy_(tar.data * (1 - Tau) + use.data * Tau)
        for tar, use in zip(self.tar_actor.parameters(), self.actor.parameters()):
            tar.data.copy_(tar.data * (1 - Tau) + use.data * Tau)
    def choose_action(self, state):
        state = torch.tensor(state).float().view(1, -1).to(device)
        action = self.actor(state).item()
        return [action + Sigma * random.uniform(-1, 1)]  # add exploration noise
env = gym.make(Env)
# env = gym.make(Env, render_mode='human')
pool = Pool()
agent = Agent()
plt_reward = [0]  # exponentially smoothed reward curve for plotting
for episode in range(Episodes):
    state = env.reset()[0]
    cnt = 0
    total_reward = 0
    terminated = False
    while not terminated:
        cnt += 1
        if cnt >= 1000:  # give up on the episode after 1000 actions
            break
        action = agent.choose_action(state)
        next_state, reward, terminated, *_ = env.step(action)
        # store the shaped reward abs(state[1]) - 2 instead of the env reward
        pool.append(state, action, abs(state[1]) - 2, next_state, terminated)
        agent.learn(pool)
        total_reward += abs(state[1]) - 2
        state = next_state
    print(f'Episode {episode} total_reward {total_reward}')
    plt_reward.append(plt_reward[-1] + 0.1 * (total_reward - plt_reward[-1]))
    plt.clf()
    plt.plot([*range(len(plt_reward))], plt_reward)
    plt.pause(0.03)
plt.show()
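If you want to actually watch the trained policy drive the car, here is a minimal evaluation sketch. It assumes it is run right after training in the same session (eval_env is my own name, not part of the original script), and it uses the greedy actor with no exploration noise:

eval_env = gym.make(Env, render_mode='human')
state = eval_env.reset()[0]
terminated = truncated = False
while not (terminated or truncated):
    with torch.no_grad():
        # greedy action from the trained actor, no noise added
        action = [agent.actor(torch.tensor(state).float().view(1, -1).to(device)).item()]
    state, reward, terminated, truncated, _ = eval_env.step(action)
eval_env.close()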