Embedding a Transformer into Reinforcement Learning (Code Practice)

ChatGPT's Answer

import torch
import torch.nn as nn
import torch.nn.functional as F


# Transformer module (encoder-only)
class Transformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads, num_layers):
        super(Transformer, self).__init__()
        # Note: d_model (= input_dim) must be divisible by num_heads
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(input_dim, hidden_dim)

    def forward(self, x):
        # Encode the state sequence, then project to the hidden dimension
        x = self.transformer_encoder(x)
        x = self.linear(x)
        return x


# Policy network: Transformer encoder followed by a softmax action head
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, num_heads, num_layers):
        super(PolicyNetwork, self).__init__()
        self.transformer = Transformer(input_dim, hidden_dim, num_heads, num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.transformer(x)
        x = F.softmax(self.fc(x), dim=-1)
        return x

Q: Why is there only an Encoder and no Decoder?
In reinforcement learning tasks, especially with policy-gradient methods, a full Transformer with both an Encoder and a Decoder is usually unnecessary. These tasks only need a mapping from states to actions; there is no output sequence to generate.

We can therefore use only the Transformer's Encoder: the state is fed in, encoded into a vector that represents it, and that vector is passed to the policy network (or the Critic network) for action selection or value estimation.

The Encoder handles variable-length state inputs and can model sequences, which is useful in many reinforcement learning tasks.

So in the PPO algorithm, only the Transformer's Encoder is needed, not the Decoder. In reinforcement learning, especially for tasks with complex state and action spaces, capturing dependencies across the state sequence is crucial for predicting future rewards and making decisions. Through its self-attention mechanism, the Transformer handles such sequence dependencies effectively, which can improve the policy's performance.
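
As a quick sanity check of the encoder-only setup described above, the sketch below pushes a small batch of states through PolicyNetwork and samples actions from the resulting distribution. The dimensions (a CartPole-like 4-dimensional state, 2 actions, 4 attention heads) are illustrative assumptions, not values from the original answer.

import torch
from torch.distributions import Categorical

# Illustrative dimensions (assumed for this sketch): CartPole-like state, 2 discrete actions
state_dim, action_dim = 4, 2
policy = PolicyNetwork(input_dim=state_dim, output_dim=action_dim,
                       hidden_dim=32, num_heads=4, num_layers=2)

# A batch of 8 states shaped (seq_len=1, batch, state_dim), as the encoder expects
states = torch.randn(1, 8, state_dim)
action_probs = policy(states)                    # (1, 8, action_dim), rows sum to 1
actions = Categorical(action_probs).sample()     # (1, 8) sampled action indices
print(action_probs.shape, actions.shape)

Because nn.TransformerEncoderLayer defaults to batch_first=False, the encoder expects input shaped (sequence, batch, feature); with a sequence length of 1, it effectively acts as a per-state feature extractor.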

GPT-4.0's Answer

# time: 2024/3/1 21:31
# author: YanJP
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Categorical
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler


# Transformer feature extractor
class TransformerFeatureExtractor(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, dropout=0.1):
        super(TransformerFeatureExtractor, self).__init__()
        self.model_dim = model_dim
        encoder_layers = TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer=encoder_layers, num_layers=num_layers)
        self.input_projection = nn.Linear(input_dim, model_dim)

    def forward(self, x):
        x = self.input_projection(x)
        # Add a sequence-length dimension; x originally has shape (batch, model_dim)
        x = x.unsqueeze(0)  # x now has shape (1, batch, model_dim)
        output = self.transformer_encoder(x)
        return output.squeeze(0)  # drop the sequence dimension, back to (batch, model_dim)


# PPO network definition
class PPONetwork(nn.Module):
    def __init__(self, state_dim, action_dim, model_dim=64, num_heads=4, num_layers=4):
        super(PPONetwork, self).__init__()
        self.feature_extractor = TransformerFeatureExtractor(input_dim=state_dim, model_dim=model_dim,
                                                             num_heads=num_heads, num_layers=num_layers)
        self.policy_head = nn.Linear(model_dim, action_dim)
        self.value_head = nn.Linear(model_dim, 1)

    def forward(self, state):
        features = self.feature_extractor(state)
        # features = features[:, -1, :]  # use the features of the last time step
        action_probs = torch.softmax(self.policy_head(features), dim=-1)
        state_values = self.value_head(features)
        return action_probs, state_values


# PPO Agent
class PPOAgent:
    def __init__(self, env):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.network = PPONetwork(self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.network.parameters(), lr=2.5e-4)
        self.gamma = 0.99
        self.lamda = 0.95  # GAE lambda (defined but not used below; returns are plain discounted sums)
        self.eps_clip = 0.2
        self.K_epoch = 4
        self.buffer_capacity = 1000
        self.batch_size = 64
        self.buffer = {'states': [], 'actions': [], 'log_probs': [], 'rewards': [], 'is_terminals': []}

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action_probs, _ = self.network(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def put_data(self, transition):
        self.buffer['states'].append(transition[0])
        self.buffer['actions'].append(transition[1])
        self.buffer['log_probs'].append(transition[2])
        self.buffer['rewards'].append(transition[3])
        self.buffer['is_terminals'].append(transition[4])

    def train_net(self):
        R = 0
        discounted_rewards = []
        for reward, is_terminal in zip(reversed(self.buffer['rewards']), reversed(self.buffer['is_terminals'])):
            if is_terminal:
                R = 0
            R = reward + (self.gamma * R)
            discounted_rewards.insert(0, R)

        discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
        old_states = torch.tensor(np.array(self.buffer['states']), dtype=torch.float32)
        old_actions = torch.tensor(self.buffer['actions']).view(-1, 1)
        old_log_probs = torch.tensor(self.buffer['log_probs']).view(-1, 1)

        # Normalize the rewards
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)

        for _ in range(self.K_epoch):
            for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer['states']))), self.batch_size, False):
                # Extract batches
                state_sample = old_states[index]
                action_sample = old_actions[index]
                old_log_probs_sample = old_log_probs[index]
                returns_sample = discounted_rewards[index].view(-1, 1)

                # Get current policies
                action_probs, state_values = self.network(state_sample)
                dist = Categorical(action_probs)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action_sample.squeeze(-1)).view(-1, 1)  # keep (batch, 1) to match old_log_probs_sample

                # Calculating the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(new_log_probs - old_log_probs_sample.detach())

                # Calculating Surrogate Loss:
                advantages = returns_sample - state_values.detach()
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
                loss = -torch.min(surr1, surr2) + 0.5 * (state_values - returns_sample).pow(2) - 0.01 * entropy

                # take gradient step
                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()
        self.buffer = {'states': [], 'actions': [], 'log_probs': [], 'rewards': [], 'is_terminals': []}

    def train(self, max_episodes):
        for episode in range(max_episodes):
            state = self.env.reset()  # gym < 0.26 API: reset() returns only the observation
            done = False
            rewards=0
            while not done:
                action, log_prob = self.select_action(state)
                next_state, reward, done, _ = self.env.step(action)  # gym < 0.26 API: step() returns 4 values
                rewards+=reward
                self.put_data((state, action, log_prob, reward, done))
                state = next_state

                if done:
                    self.train_net()
                    if episode % 5 == 0:
                        print("eposide:", episode, '\t reward:', rewards)



# Main function
def main():
    env = gym.make('CartPole-v1')
    agent = PPOAgent(env)
    max_episodes = 300
    agent.train(max_episodes)


if __name__ == "__main__":
    main()

Note: the code runs, but it does not learn a working policy!
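
When a PPO loop "runs but does not learn", the advantage estimate is a common place to look. The agent above defines self.lamda = 0.95 but never uses it; the sketch below shows how Generalized Advantage Estimation (GAE) could be computed from a rollout's rewards, value estimates, and done flags. This is only an illustration: the function name compute_gae and its interface are assumptions, not part of the original code.

import torch

def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    # rewards, dones: 1-D tensors of length T collected from one rollout
    # values: 1-D tensor of length T + 1 (value_head outputs plus a bootstrap value for the final next state)
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        mask = 1.0 - dones[t].float()                      # zero out across episode boundaries
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        gae = delta + gamma * lam * mask * gae
        advantages[t] = gae
    returns = advantages + values[:-1]                     # regression targets for the value head
    return advantages, returns

In train_net, these advantages (normalized) could replace returns_sample - state_values.detach(), with the value head regressed toward returns instead of the normalized discounted rewards.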
