An LSTM-based DDPG Implementation

I really haven't felt like touching this project for the past couple of days, but graduation comes first.
I hooked the agent up to the environment I built myself, and it can now train on high-dimensional state inputs (I haven't evaluated the results yet, but at least it runs end to end). I also added batch training: according to the Berkeley deep RL course notes, updating on minibatches reduces the variance of the gradient estimates and makes training more stable. However, because the replay memory is a plain Python list, sampling a batch is fairly convoluted (my Python clearly still carries a Java accent), so suggestions are very welcome.
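For comparison, here is a minimal sketch of how the same replay memory could be written with collections.deque and random.sample instead of a plain list; the class name ReplayBuffer and its methods are just illustrative and are not used in the code below.

```python
# Illustrative sketch only (not used by the implementation below): a deque
# evicts old transitions automatically and random.sample draws a batch
# without manual index bookkeeping.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def store(self, s, r, a, s_):
        self.buffer.append((s, r, a, s_))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        s, r, a, s_ = zip(*batch)
        return (np.asarray(s, dtype=np.float32),
                np.asarray(r, dtype=np.float32).reshape(-1, 1),
                np.asarray(a, dtype=np.float32).reshape(-1, 1),
                np.asarray(s_, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)
```

With such a buffer, sampling would reduce to calling `buffer.sample(batch_size)` once at least batch_size transitions have been stored.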

I've recently looked at a few DDPG implementations (essentially all derived from Morvan Zhou's version). For my graduation project, a plain fully connected network seemed unstable and did not perform well, so I tried swapping in an LSTM, which is known for handling sequential data well (this has certainly been done before), and implemented an LSTM-based DDPG by hand. Feedback and pointers are very welcome.

```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from Env_2_DDPG import Environment  # custom environment built for the graduation project

date_count = 5        # sequence length: number of time steps fed to the LSTM
date_dim = 6          # feature dimension of one time step
hide_dim = 10         # (currently unused) hidden size of the fully connected baseline
hide_dim_lstm = 100   # LSTM hidden size
gamma = 0.8           # discount factor
lr_miu = 0.01         # actor (mu) learning rate
lr_Q = 0.02           # critic (Q) learning rate
tau = 0.01            # soft-update coefficient for the target networks
trans_num = 10        # replay memory capacity (kept equal to memory_size)
batch_size = 4
MAX_EPISODES = 10
MAX_EP_STEPS = 500
memory_size = 10      # minimum number of stored transitions before training starts


class My_loss(nn.Module):
    """Actor loss: maximise Q(s, mu(s)) by minimising its negative mean."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return torch.mean(-x)


class A_net(nn.Module):
    """Actor network: LSTM over the state sequence, then a linear layer to one action."""

    def __init__(self):
        super(A_net, self).__init__()
        self.state_dim = date_dim * date_count
        # batch_first=True so the input is (batch, date_count, date_dim)
        self.net = nn.LSTM(date_dim, hide_dim_lstm, batch_first=True)
        self.reg = nn.Linear(date_count * hide_dim_lstm, 1)

    def forward(self, state):
        state = torch.as_tensor(state, dtype=torch.float32)
        if state.dim() == 2:
            # single (unbatched) state from the environment: add a batch dimension
            state = torch.unsqueeze(state, 0)
        x, _ = self.net(state)          # (batch, date_count, hide_dim_lstm)
        b, s, h = x.shape
        x = x.reshape(b, s * h)         # flatten the hidden states of the whole sequence
        x = self.reg(x)                 # (batch, 1)
        return x


class C_net(nn.Module):
    """Critic network: LSTM features of the state combined with a linear embedding of the action."""

    def __init__(self):
        super(C_net, self).__init__()
        self.state_dim = date_count * date_dim
        # batch_first=True so the input is (batch, date_count, date_dim)
        self.net = nn.LSTM(date_dim, hide_dim_lstm, batch_first=True)
        self.lstm_res = nn.Linear(date_count * hide_dim_lstm, date_count)
        self.action_net = nn.Linear(1, date_count)
        self.reg = nn.Linear(date_count, 1)

    def forward(self, state, action):
        state = torch.as_tensor(state, dtype=torch.float32)
        if state.dim() == 2:
            # single (unbatched) state: add a batch dimension
            state = torch.unsqueeze(state, 0)
        x1, _ = self.net(state)                     # (batch, date_count, hide_dim_lstm)
        b, s, h = x1.shape
        x1 = self.lstm_res(x1.reshape(b, s * h))    # state branch:  (batch, date_count)
        x2 = self.action_net(action)                # action branch: (batch, date_count)
        x = self.reg(x1 + x2)                       # Q(s, a): (batch, 1)
        return x


class ddpg_lstm(nn.Module):
    def __init__(self):
        super(ddpg_lstm, self).__init__()
        self.miu_net = A_net()   # actor (mu)
        self.miu_pie = A_net()   # actor target (mu')
        self.Q_net = C_net()     # critic (Q)
        self.Q_pie = C_net()     # critic target (Q')
        # start the target networks from the same weights as the online networks
        self.miu_pie.load_state_dict(self.miu_net.state_dict())
        self.Q_pie.load_state_dict(self.Q_net.state_dict())
        self.optim_miu = optim.SGD(self.miu_net.parameters(), lr=lr_miu, momentum=0.5)
        self.optim_Q = optim.Adam(self.Q_net.parameters(), lr=lr_Q)
        self.loss_Q = nn.MSELoss()
        self.memory = list()     # replay memory (a plain list, see the note above)
        self.index = 0           # number of transitions stored so far

    def learn(self, tra):
        s = torch.Tensor(tra[0]).reshape(batch_size, date_count, date_dim)
        r = torch.Tensor(tra[1])
        a = torch.Tensor(tra[2])
        s_ = torch.Tensor(tra[3]).reshape(batch_size, date_count, date_dim)

        # critic update: the target y = r + gamma * Q'(s', mu'(s')) is a constant,
        # so it is computed without tracking gradients
        with torch.no_grad():
            a_ = self.miu_pie(s_)
            y = r + gamma * self.Q_pie(s_, a_)
        q = self.Q_net(s, a)
        self.optim_Q.zero_grad()
        q_loss = self.loss_Q(q, y)
        q_loss.backward()
        self.optim_Q.step()

        # actor update: maximise Q(s, mu(s)); the gradient must flow through the
        # actor's own action, not the action stored in the replay memory
        self.optim_miu.zero_grad()
        miu_loss = My_loss()(self.Q_net(s, self.miu_net(s)))
        miu_loss.backward()
        self.optim_miu.step()

    def soft_update(self):
        # Polyak averaging of every parameter. nn.LSTM has no single .weight
        # attribute (its weights are weight_ih_l0, weight_hh_l0, ...), so
        # iterate over the full parameter lists of each network pair instead.
        for target, source in ((self.miu_pie, self.miu_net), (self.Q_pie, self.Q_net)):
            for t_param, s_param in zip(target.parameters(), source.parameters()):
                t_param.data.copy_(tau * s_param.data + (1 - tau) * t_param.data)

    def store_trans(self, s, r, a, s_):
        # store one transition (s, r, a, s_); the list is capped at trans_num entries
        self.memory.append([s, r, a, s_])
        self.index += 1
        if len(self.memory) > trans_num:
            del self.memory[0]

    def train_model(self):
        # sample batch_size random indices (with replacement) from the replay memory
        batch = np.random.choice(memory_size, batch_size)
        bs = np.zeros((batch_size, date_count * date_dim))
        br = np.zeros((batch_size, 1))
        ba = np.zeros((batch_size, 1))
        bs_ = np.zeros((batch_size, date_count * date_dim))
        for index_, item in enumerate(batch):
            bs[index_] = np.array(self.memory[item][0]).reshape(date_dim * date_count)
            br[index_] = float(self.memory[item][1])
            ba[index_] = float(self.memory[item][2])
            bs_[index_] = np.array(self.memory[item][3]).reshape(date_dim * date_count)
        self.learn([bs, br, ba, bs_])
        self.soft_update()   # let the target networks track the online networks

    def next_action(self, state):
        # act with the current actor (no exploration noise is added yet);
        # detach so the action can be stored in the replay memory
        action = self.miu_net(state)
        return action.detach()


ddpg = ddpg_lstm()
env = Environment()


def train():
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            a = ddpg.next_action(s)
            s_, r = env.step(a)
            # argument order must match store_trans(s, r, a, s_); the reward is scaled down
            ddpg.store_trans(s, r / 10, a, s_)
            if ddpg.index > memory_size:
                ddpg.train_model()
            s = s_
            ep_reward += r
        if i > 0:
            print('Episode:', i, ' Reward: %i' % int(ep_reward))
    torch.save(ddpg, 'ddpg2.pt')


if __name__ == '__main__':
    train()
```





Note that I haven't done any real data preprocessing here; the code was written mainly with a single sample in mind rather than a proper batch, so a torch.unsqueeze() is applied to force an extra (batch) dimension before a state is fed into the LSTM. A small standalone shape sketch follows below.
For now the program is only at the "it runs" stage; I'll keep updating it when I have time.
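
Just to make the shape handling explicit, here is a tiny self-contained example (separate from the classes above, with made-up tensor values) showing why the extra dimension is needed when nn.LSTM is used with batch_first=True:

```python
# Standalone shape sketch (illustrative values only): a single state of shape
# (date_count, date_dim) gets an extra leading batch dimension so that it
# matches the (batch, seq_len, feature) layout expected with batch_first=True.
import torch
import torch.nn as nn

date_count, date_dim, hidden = 5, 6, 100
lstm = nn.LSTM(date_dim, hidden, batch_first=True)

single_state = torch.randn(date_count, date_dim)         # one state from the environment
batched_state = torch.unsqueeze(single_state, 0)         # -> (1, date_count, date_dim)
out_single, _ = lstm(batched_state)                      # -> (1, date_count, hidden)

batch_of_states = torch.randn(4, date_count, date_dim)   # a sampled minibatch
out_batch, _ = lstm(batch_of_states)                     # -> (4, date_count, hidden)
print(out_single.shape, out_batch.shape)
```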
