[Reinforcement Learning from Scratch] A DQN Agent Based on Boltzmann Sampling


More code on my Gitee homepage: https://gitee.com/GZHzzz
CSDN blog homepage: https://blog.csdn.net/gzhzzaa

Foreword

show me code, no bb

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, N_STATES, N_ACTIONS, N_HIDDEN):
        super(Net, self).__init__()
        # two fully-connected hidden layers followed by an output layer
        self.fc1 = nn.Linear(N_STATES, N_HIDDEN)
        # self.fc1.weight.data.normal_(0, 0.2)
        self.fc2 = nn.Linear(N_HIDDEN, 128)
        # self.fc2.weight.data.normal_(0, 0.2)
        self.fc3 = nn.Linear(128, N_ACTIONS)
        # self.fc3.weight.data.normal_(0, 0.2)
 
    def forward(self, x):
        # forward pass: fc1 --> relu --> fc2 --> relu --> fc3 --> actions_value
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        actions_value = self.fc3(x)
        return actions_value

class DQN_agent:
    def __init__(
        self,
        action_num,
        n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        replace_target_iter=300,
        memory_size=500,
        batch_size=32):

        # torch.cuda.set_device(cuda_device)
        self.n_actions = action_num
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.learn_step_counter = 0
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))  # each row stores one transition (s, a, r, s_)
        self.eval_net, self.target_net = Net(self.n_features, self.n_actions, 512), Net(self.n_features, self.n_actions, 512)
        # self.eval_net
        # self.target_net
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr)
        self.loss_func = nn.MSELoss()
        self.flag = 0  # to indicate whether the buffer can provide a batch of data


    def store_transition(self, s, a, r, s_): # store one transition (s, a, r, s_) in the replay buffer
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1
        self.flag = min(self.flag + 1, self.memory_size)


    def learn(self): # update the DQN parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # randomly sample a batch of transitions from memory
        sample_index = np.random.choice(self.flag, self.batch_size)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :self.n_features])
        b_a = torch.LongTensor(b_memory[:, self.n_features:self.n_features + 1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, self.n_features + 1:self.n_features + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -self.n_features:])
 
        # compute q_target and the loss, then backpropagate
        q_eval = self.eval_net(b_s).gather(1, b_a)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + self.gamma * q_next.max(1)[0].view(self.batch_size, 1)
        
        loss = self.loss_func(q_eval, q_target)
 
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
    def get_blz_action(self, observation, T=5): # sample an action from the Boltzmann (softmax) distribution over Q-values with temperature T
        observation = torch.unsqueeze(torch.FloatTensor(observation), 0)
        
        actions_value = self.eval_net.forward(observation)
        actions_value = torch.squeeze(actions_value)
        actions_value = actions_value / T
        probs = torch.softmax(actions_value, dim=0)
        action = torch.distributions.Categorical(probs).sample().numpy()
        
        return action

    def get_optimal_action(self, observation): # greedily pick the action with the highest Q-value
        observation = torch.unsqueeze(torch.FloatTensor(observation), 0)
        actions_value = self.eval_net.forward(observation)
        action = torch.max(actions_value, 1)[1].cpu().data.numpy()
        action = action[0]
        return action


    def save(self, filename):
        torch.save(self.eval_net.state_dict(), filename + "saved_critic")

    def load(self, filename):
        print('load model from: '+filename)
        self.eval_net.load_state_dict(torch.load(filename + "saved_critic"))
        self.target_net.load_state_dict(self.eval_net.state_dict())
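
For reference, get_blz_action turns the current Q-values into a Boltzmann (softmax) policy with temperature T and samples an action from it:

$$\pi(a \mid s) = \frac{\exp\big(Q(s,a)/T\big)}{\sum_{a'} \exp\big(Q(s,a')/T\big)}$$

A large T flattens the distribution toward uniform random exploration, while T close to 0 approaches the greedy choice made by get_optimal_action. The learn step uses the standard DQN target $y = r + \gamma \max_{a'} Q_{\text{target}}(s', a')$, computed with the periodically synced target network. A quick standalone check of the temperature effect, using made-up Q-values (my own illustration, not from the original post):

import torch
q = torch.tensor([1.0, 2.0, 3.0])       # made-up Q-values for three actions
print(torch.softmax(q / 5.0, dim=0))    # T = 5   -> close to uniform, roughly [0.27, 0.33, 0.40]
print(torch.softmax(q / 0.5, dim=0))    # T = 0.5 -> sharply favors the best action, roughly [0.02, 0.12, 0.87]
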
  • I have run all of the code myself, you know! 😝

Results

  • It swaps the conventional epsilon-greedy action selection for Boltzmann sampling; just import the agent into your own environment and it is ready to use, as in the training-loop sketch below!
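
Below is a minimal training-loop sketch of how the agent could be plugged into a Gym environment. The environment choice (CartPole-v1), the hyperparameters, and the episode count are my own illustrative assumptions, not part of the original post; it also assumes the classic gym API where reset() returns the observation and step() returns (obs, reward, done, info).

import gym

# minimal training-loop sketch; assumes the DQN_agent class above is in scope
env = gym.make('CartPole-v1')
agent = DQN_agent(action_num=env.action_space.n,
                  n_features=env.observation_space.shape[0],
                  memory_size=2000,
                  batch_size=32)

for episode in range(200):
    s = env.reset()
    ep_reward = 0.0
    done = False
    while not done:
        a = agent.get_blz_action(s, T=5)        # Boltzmann exploration
        s_, r, done, info = env.step(int(a))
        agent.store_transition(s, a, r, s_)
        if agent.flag >= agent.batch_size:      # learn once a full batch is buffered
            agent.learn()
        s = s_
        ep_reward += r
    print('episode {}, reward {:.1f}'.format(episode, ep_reward))

agent.save('./cartpole_')   # writes ./cartpole_saved_critic
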

Closing remarks

Ten years spent sharpening one sword; let us keep striving together!
More code on my Gitee homepage: https://gitee.com/GZHzzz
CSDN blog homepage: https://blog.csdn.net/gzhzzaa

  • Fighting!😎

Classic agent models implemented in PyTorch
Classic reinforcement learning papers

while True:
	Go life


Thanks for the likes and comments! (❁´◡`❁)
