Actor-Critic Algorithm
1. Actor Network
The Actor network implements a policy-based, policy-gradient method: it selects actions according to the probabilities given by the current policy.
The Actor interacts with the environment directly under the current policy, and the reward obtained from this interaction is used directly to optimize that policy.
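The quantity the Actor ascends is the expected return of its policy. By the policy gradient theorem, this gradient can be written (up to a weighting by the state distribution) as
$\nabla_{\theta} J(\theta) = \mathbb{E}_{\pi}\big[\nabla_{\theta} \ln \pi(a_t|s_t,\theta)\, Q^{\pi}(s_t, a_t)\big]$,
which is the form the Actor update in Section 4 follows, with the Critic supplying the estimate of $Q^{\pi}$.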
2. Critic Network
The Critic network is a value-function method in the spirit of Q-learning; it scores the actions taken by the Actor, and it is updated by gradient descent.
The Critic evaluates the policy's interaction with the environment through the current value function; the reward obtained from the interaction is used to improve the value function, which in turn guides the Actor's policy update.
3. The Actor-Critic Algorithm
The Actor selects actions, and the Critic evaluates how good the chosen action is. The Actor picks actions according to its stored (parameterized) policy; the Critic's evaluation is based on the TD error, which is computed from the current value function.
For the state-value function, the TD error is the TD target minus the current estimate:
$TD\ error = \big(r_{t+1} + \gamma V(s_{t+1})\big) - V(s_t)$
For the action-value function:
$TD\ error = \big(r_{t+1} + \gamma Q(s_{t+1}, a_{t+1})\big) - Q(s_t, a_t)$
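As a quick sanity check with made-up numbers: take $\gamma = 0.98$ (as in the code below), $r_{t+1} = 1$, $V(s_{t+1}) = 5$ and $V(s_t) = 4.5$; then the TD error is $1 + 0.98 \times 5 - 4.5 = 1.4$. A positive error means the current estimate $V(s_t)$ is too low and should be pushed up, a negative one that it should be pushed down.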
4. Algorithm Flow
At step $t$, sample $a_t$ from $\pi(a|s_t,\theta_t)$, observe $r_{t+1}$ and $s_{t+1}$, then sample $a_{t+1}$ from $\pi(a|s_{t+1},\theta_t)$. This gives one tuple of data:
$(s_t, a_t, r_{t+1}, s_{t+1}, a_{t+1})$ — which is exactly the data used by SARSA.
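A minimal sketch of collecting one such tuple, assuming `env` is a gym environment and `agent` is an `ActorCritic` instance as defined later in this post (note that the CartPole code below actually stores $(s_t, a_t, r_{t+1}, s_{t+1}, done)$ rather than $a_{t+1}$, because its Critic is a state-value network):

```python
# Sketch: one SARSA-style transition (s_t, a_t, r_{t+1}, s_{t+1}, a_{t+1});
# `env` and `agent` are assumed to exist as in run.py below.
state = env.reset()[0]                              # s_t
action = agent.take_action(state)                   # a_t ~ pi(.|s_t, theta_t)
next_state, reward, done, _, _ = env.step(action)   # r_{t+1}, s_{t+1}
next_action = agent.take_action(next_state)         # a_{t+1} ~ pi(.|s_{t+1}, theta_t)
transition = (state, action, reward, next_state, next_action)
```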
1. Critic (value update)
$\omega_{t+1} = \omega_t + \alpha_{\omega}\,\big[\big(r_{t+1} + \gamma Q(s_{t+1}, a_{t+1}, \omega_t)\big) - Q(s_t, a_t, \omega_t)\big]\,\nabla_{\omega} Q(s_t, a_t, \omega_t)$
That is, the weights move along $\nabla_{\omega} Q$ scaled by the TD error (TD target minus current estimate). After $\omega$ is updated we obtain a new $Q$.
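In the CartPole code below the Critic is a state-value network rather than an action-value network, but the update has the same semi-gradient form: doing gradient descent on the squared error between $V(s_t,\omega)$ and a detached TD target moves $\omega$ along the TD error times $\nabla_{\omega}V$. A self-contained toy sketch (the network, batch sizes, and tensors here are made up for illustration, not the post's actual classes):

```python
import torch
from torch import nn
from torch.nn import functional as F

critic = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 1))  # toy V(s) network
opt = torch.optim.Adam(critic.parameters(), lr=1e-2)
gamma = 0.98

states      = torch.randn(8, 4)   # dummy batch of s_t
next_states = torch.randn(8, 4)   # dummy batch of s_{t+1}
rewards     = torch.ones(8, 1)    # dummy r_{t+1}
dones       = torch.zeros(8, 1)   # dummy episode-end flags

# TD target r_{t+1} + gamma * V(s_{t+1}); detached so only V(s_t) receives gradient
td_target = rewards + gamma * critic(next_states) * (1 - dones)
loss = F.mse_loss(critic(states), td_target.detach())
opt.zero_grad()
loss.backward()   # d(loss)/dw = -2 * mean(delta * dV/dw), so the optimizer step
opt.step()        # moves w in the direction of delta * dV/dw, as in the rule above
```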
2. Actor (policy update)
$\theta_{t+1} = \theta_t + \alpha_{\theta}\,\nabla_{\theta} \ln \pi(a_t|s_t,\theta_t)\, Q(s_t, a_t, \omega_{t+1})$
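In code, ascending this gradient is typically implemented by descending the loss $-\ln\pi(a_t|s_t,\theta)\cdot\delta_t$ with $\delta_t$ detached from the computation graph, which is what `actor_loss` does in `ActorCritic.update` below. Another self-contained toy sketch (made-up policy network and batch):

```python
import torch
from torch import nn
from torch.nn import functional as F

actor = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))  # toy policy logits
opt = torch.optim.Adam(actor.parameters(), lr=1e-3)

states   = torch.randn(8, 4)            # dummy batch of s_t
actions  = torch.randint(0, 2, (8, 1))  # dummy actions a_t that were taken
td_delta = torch.randn(8, 1)            # dummy TD errors supplied by the Critic

probs = F.softmax(actor(states), dim=1)                  # pi(a|s_t, theta)
log_probs = torch.log(probs.gather(1, actions))          # ln pi(a_t|s_t, theta)
actor_loss = torch.mean(-log_probs * td_delta.detach())  # descending this ascends ln(pi)*delta
opt.zero_grad()
actor_loss.backward()
opt.step()
```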
5. Alternative Forms of the Actor-Critic Algorithm
The signal the Critic feeds back to the Actor can take several forms, e.g. the action value $Q(s_t,a_t)$, the advantage $A(s_t,a_t)$, or the TD error $\delta_t$; the implementation below uses the TD error.
6. Summary
Critic: SARSA + value function approximation
Actor: policy update
7. Code
RL_brain.py
```python
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np

# ------------------------------------ #
# Policy-gradient Actor: action selection
# ------------------------------------ #

class PolicyNet(nn.Module):
    def __init__(self, n_states, n_hiddens, n_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(n_states, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, n_actions)

    # forward pass
    def forward(self, x):
        x = self.fc1(x)          # [b, n_states] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc2(x)          # [b, n_hiddens] --> [b, n_actions]
        # probability of each action in the given state
        x = F.softmax(x, dim=1)  # [b, n_actions] --> [b, n_actions]
        return x

# ------------------------------------ #
# Value-function Critic: state evaluation, output shape=[b,1]
# ------------------------------------ #

class ValueNet(nn.Module):
    def __init__(self, n_states, n_hiddens):
        super(ValueNet, self).__init__()
        self.fc1 = nn.Linear(n_states, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, 1)

    # forward pass
    def forward(self, x):
        x = self.fc1(x)  # [b, n_states] --> [b, n_hiddens]
        x = F.relu(x)
        x = self.fc2(x)  # [b, n_hiddens] --> [b, 1]
        return x

# ------------------------------------ #
# Actor-Critic
# ------------------------------------ #

class ActorCritic:
    def __init__(self, n_states, n_hiddens, n_actions,
                 actor_lr, critic_lr, gamma):
        # hyperparameters
        self.gamma = gamma
        # instantiate the policy network
        self.actor = PolicyNet(n_states, n_hiddens, n_actions)
        # instantiate the value network
        self.critic = ValueNet(n_states, n_hiddens)
        # optimizer for the policy network
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        # optimizer for the value network
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)

    # action selection
    def take_action(self, state):
        # reshape: numpy [n_states] --> [1, n_states] --> tensor
        state = torch.tensor(state[np.newaxis, :], dtype=torch.float)
        # probabilities of each action in the current state
        probs = self.actor(state)
        # build a categorical distribution from probs
        action_dist = torch.distributions.Categorical(probs)
        # sample an action  tensor --> int
        action = action_dist.sample().item()
        # return the chosen action
        return action

    # model update
    def update(self, transition_dict):
        # training batch (one episode)
        states = torch.tensor(np.array(transition_dict['states']), dtype=torch.float)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1)
        next_states = torch.tensor(np.array(transition_dict['next_states']), dtype=torch.float)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1)
        # predicted state value for the current states
        td_value = self.critic(states)
        # TD target for the current states
        td_target = rewards + self.gamma * self.critic(next_states) * (1 - dones)
        # TD error: target state value minus predicted state value
        td_delta = td_target - td_value
        # log-probability of the action actually taken in each state
        log_probs = torch.log(self.actor(states).gather(1, actions))
        # policy-gradient loss
        actor_loss = torch.mean(-log_probs * td_delta.detach())
        # value loss between the prediction and the (detached) TD target
        critic_loss = torch.mean(F.mse_loss(self.critic(states), td_target.detach()))
        # zero the gradients
        self.actor_optimizer.zero_grad()   # policy network optimizer
        self.critic_optimizer.zero_grad()  # value network optimizer
        # backpropagation
        actor_loss.backward()
        critic_loss.backward()
        # parameter update
        self.actor_optimizer.step()
        self.critic_optimizer.step()
```
run.py
```python
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
from RL_brain import ActorCritic
import wandb
import os
import random

# use the API key from your own wandb account
os.environ["WANDB_API_KEY"] = '***********************************************'

def seed_torch(seed):
    # fix the random seeds so that runs are reproducible
    torch.manual_seed(seed)
    if torch.backends.cudnn.enabled:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

# ----------------------------------------- #
# Hyperparameters
# ----------------------------------------- #

num_episodes = 3000   # total number of episodes
gamma = 0.98          # discount factor
actor_lr = 1e-3       # learning rate of the policy network
critic_lr = 1e-2      # learning rate of the value network
n_hiddens = 128       # number of hidden units
env_name = 'CartPole-v1'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# return_list = []    # return of each episode
seed = 42

# ----------------------------------------- #
# Environment
# ----------------------------------------- #

env = gym.make(env_name, render_mode="human")
n_states = env.observation_space.shape[0]  # state dimension: 4
n_actions = env.action_space.n             # number of actions: 2
np.random.seed(seed)
random.seed(seed)
seed_torch(seed)
env.reset(seed=seed)

wandb.init(
    project="Actor-Critic",
    name="Actor-Critic",
    config={
        "env_name": env_name,
        "num_episodes": num_episodes,
        "gamma": gamma,
        "actor_lr": actor_lr,
        "critic_lr": critic_lr,
        "n_hiddens": n_hiddens,
        "n_states": n_states,
        "n_actions": n_actions,
        "device": str(device),
    }
)

# ----------------------------------------- #
# Model
# ----------------------------------------- #

agent = ActorCritic(n_states=n_states,    # state dimension
                    n_hiddens=n_hiddens,  # number of hidden units
                    n_actions=n_actions,  # number of actions
                    actor_lr=actor_lr,    # policy network learning rate
                    critic_lr=critic_lr,  # value network learning rate
                    gamma=gamma)          # discount factor

# ----------------------------------------- #
# Training -- update once per episode
# ----------------------------------------- #

for i in range(num_episodes):
    state = env.reset()[0]  # reset the environment
    done = False            # episode-finished flag
    episode_return = 0      # cumulative reward of this episode
    # env.render()
    # buffer holding the transitions of this episode
    transition_dict = {
        'states': [],
        'actions': [],
        'next_states': [],
        'rewards': [],
        'dones': [],
    }
    while not done:
        action = agent.take_action(state)                  # choose an action
        next_state, reward, done, _, _ = env.step(action)  # step the environment
        # store the state / action / reward / ... of this step
        transition_dict['states'].append(state)
        transition_dict['actions'].append(action)
        transition_dict['next_states'].append(next_state)
        transition_dict['rewards'].append(reward)
        transition_dict['dones'].append(done)
        # move to the next state
        state = next_state
        # accumulate the episode reward
        episode_return += reward
        # force-stop once the cumulative reward reaches 3000
        if episode_return >= 3000:
            break
    # return_list.append(episode_return)  # store the return of this episode
    # train the model on this episode
    agent.update(transition_dict)
    # log episode information
    print(f'iter:{i}, return:{episode_return}')
    wandb.log(
        {"episode_return": episode_return},
    )

# -------------------------------------- #
# Plotting
# -------------------------------------- #

wandb.finish()
env.close()
```