[Reinforcement Learning from Scratch] A DQN Agent Based on Boltzmann Sampling


More code on my Gitee homepage: https://gitee.com/GZHzzz
CSDN blog homepage: https://blog.csdn.net/gzhzzaa

Foreword

show me code, no bb

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, N_STATES, N_ACTIONS, N_HIDDEN):
        super(Net, self).__init__()
        # two fully-connected hidden layers followed by an output layer
        self.fc1 = nn.Linear(N_STATES, N_HIDDEN)
        # self.fc1.weight.data.normal_(0, 0.2)
        self.fc2 = nn.Linear(N_HIDDEN, 128)
        # self.fc2.weight.data.normal_(0, 0.2)
        self.fc3 = nn.Linear(128, N_ACTIONS)
        # self.fc3.weight.data.normal_(0, 0.2)
 
    def forward(self, x):
        # forward pass: fc1 --> relu --> fc2 --> relu --> fc3 --> actions_value
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        actions_value = self.fc3(x)
        return actions_value

class DQN_agent:
    def __init__(
        self,
        action_num,
        n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        replace_target_iter=300,
        memory_size=500,
        batch_size=32):

        # torch.cuda.set_device(cuda_device)
        self.n_actions = action_num
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.learn_step_counter = 0
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))  # each row stores one transition (s, a, r, s_)
        self.eval_net, self.target_net = Net(self.n_features, self.n_actions, 512), Net(self.n_features, self.n_actions, 512)
        # self.eval_net
        # self.target_net
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr)
        self.loss_func = nn.MSELoss()
        self.flag = 0  # to indicate whether the buffer can provide a batch of data


    def store_transition(self, s, a, r, s_): # store one transition (s, a, r, s_) in the replay buffer
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1
        self.flag = min(self.flag + 1, self.memory_size)


    def learn(self): # update the DQN parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # randomly sample a batch of transitions from memory
        sample_index = np.random.choice(self.flag, self.batch_size)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :self.n_features])
        b_a = torch.LongTensor(b_memory[:, self.n_features:self.n_features + 1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, self.n_features + 1:self.n_features + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -self.n_features:])
 
        # compute q_target and the loss, then backpropagate
        q_eval = self.eval_net(b_s).gather(1, b_a)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + self.gamma * q_next.max(1)[0].view(self.batch_size, 1)
        
        loss = self.loss_func(q_eval, q_target)
 
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
    def get_blz_action(self, observation, T=5): # sample an action from the Boltzmann (softmax) distribution over Q-values with temperature T
        observation = torch.unsqueeze(torch.FloatTensor(observation), 0)
        
        actions_value = self.eval_net.forward(observation)
        actions_value = torch.squeeze(actions_value)
        actions_value = actions_value / T
        probs = torch.softmax(actions_value, dim=0)
        action = torch.distributions.Categorical(probs).sample().numpy()
        
        return action

    def get_optimal_action(self, observation): # greedily pick the action with the highest Q-value
        observation = torch.unsqueeze(torch.FloatTensor(observation), 0)
        actions_value = self.eval_net.forward(observation)
        action = torch.max(actions_value, 1)[1].cpu().data.numpy()
        action = action[0]
        return action


    def save(self, filename):
        torch.save(self.eval_net.state_dict(), filename + "saved_critic")

    def load(self, filename):
        print('load model from: '+filename)
        self.eval_net.load_state_dict(torch.load(filename + "saved_critic"))
        self.target_net.load_state_dict(self.eval_net.state_dict())
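
For reference, get_blz_action turns the current Q-values into a Boltzmann (softmax) policy with temperature T and samples an action from it:

$$\pi(a \mid s) = \frac{\exp\big(Q(s,a)/T\big)}{\sum_{a'} \exp\big(Q(s,a')/T\big)}$$

A large T flattens the distribution toward uniform random exploration, while T close to 0 approaches the greedy choice made by get_optimal_action. The learn step uses the standard DQN target $y = r + \gamma \max_{a'} Q_{\text{target}}(s', a')$, computed with the periodically synced target network. A quick standalone check of the temperature effect, using made-up Q-values (my own illustration, not from the original post):

import torch
q = torch.tensor([1.0, 2.0, 3.0])       # made-up Q-values for three actions
print(torch.softmax(q / 5.0, dim=0))    # T = 5   -> close to uniform, roughly [0.27, 0.33, 0.40]
print(torch.softmax(q / 0.5, dim=0))    # T = 0.5 -> sharply favors the best action, roughly [0.02, 0.12, 0.87]
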
  • I have run all of the code myself, you know! 😝

Results

  • It swaps the conventional epsilon-greedy action selection for Boltzmann sampling; just import the agent into your own environment and it is ready to use, as in the training-loop sketch below!
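
Below is a minimal training-loop sketch of how the agent could be plugged into a Gym environment. The environment choice (CartPole-v1), the hyperparameters, and the episode count are my own illustrative assumptions, not part of the original post; it also assumes the classic gym API where reset() returns the observation and step() returns (obs, reward, done, info).

import gym

# minimal training-loop sketch; assumes the DQN_agent class above is in scope
env = gym.make('CartPole-v1')
agent = DQN_agent(action_num=env.action_space.n,
                  n_features=env.observation_space.shape[0],
                  memory_size=2000,
                  batch_size=32)

for episode in range(200):
    s = env.reset()
    ep_reward = 0.0
    done = False
    while not done:
        a = agent.get_blz_action(s, T=5)        # Boltzmann exploration
        s_, r, done, info = env.step(int(a))
        agent.store_transition(s, a, r, s_)
        if agent.flag >= agent.batch_size:      # learn once a full batch is buffered
            agent.learn()
        s = s_
        ep_reward += r
    print('episode {}, reward {:.1f}'.format(episode, ep_reward))

agent.save('./cartpole_')   # writes ./cartpole_saved_critic
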

Closing remarks

Ten years spent sharpening one sword; let us keep striving together!
More code on my Gitee homepage: https://gitee.com/GZHzzz
CSDN blog homepage: https://blog.csdn.net/gzhzzaa

  • Fighting!😎

Classic agent models implemented in PyTorch
Classic reinforcement learning papers

while True:
	Go life


Thanks for the likes and comments! (❁´◡`❁)
