基于 PyTorch 的 DQN 算法实现

最新推荐文章于 2024-09-27 09:35:59 发布

景清丶

最新推荐文章于 2024-09-27 09:35:59 发布

阅读量3.7k

点赞数 2

分类专栏：深度强化学习

本文链接：https://blog.csdn.net/Gweixiao/article/details/123438441

版权

强化学习 DQN PyTorch 神经网络迷宫环境

关键词由CSDN通过智能技术生成

深度强化学习专栏收录该内容

1 篇文章 0 订阅

订阅专栏

参考文章：
- https://www.cnblogs.com/cjnmy36723/p/7018860.html
- https://www.pythonheidong.com/blog/article/363261/59ae746d690b1ffb13c0/
- https://blog.csdn.net/weixin_40759186/article/details/87524192
感谢老师们的文章。

很多文章使用 gym 来实现强化学习算法，这里使用的是自己创建的简单迷宫环境，如下图所示：
（图：迷宫环境示意图）
代码比较简单，直接上代码吧：

import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque


# Reward table for the 6-room maze: r[state][action] is the reward Dqn.step
# returns for taking `action` in `state` (the action index is also used as the
# next state). Entries of -1 are the ones Dqn.learn treats as "no bootstrap"
# transitions; 100 marks moves into room 5, which Dqn.step flags as done.
# NOTE(review): r[4][3] is 1 while the other non-goal, non -1 entries are 0 --
# confirm this is intended and not a typo.
r = np.array(
    [
        [-1, -1, -1, -1, 0, -1],   # from room 0
        [-1, -1, -1, 0, -1, 100],  # from room 1
        [-1, -1, -1, 0, -1, -1],   # from room 2
        [-1, 0, 0, -1, 0, -1],     # from room 3
        [0, -1, -1, 1, -1, 100],   # from room 4
        [-1, 0, -1, -1, 0, 100],   # from room 5
    ],
    dtype=np.float64,
)


# ---- Environment size -------------------------------------------------------
state_num = 6    # number of states (rooms)
action_num = 6   # number of actions (target rooms)

# ---- Exploration (epsilon-greedy) -------------------------------------------
INITIAL_EPSILON = 1e-1   # starting value of epsilon; meant to shrink over time
FINAL_EPSILON = 1e-4     # intended lower bound for epsilon
EXPLORE = 3e6            # number of steps over which epsilon is annealed
epsilon = 0              # exploration counter; not referenced elsewhere in this file

# ---- Optimisation -----------------------------------------------------------
BATCH = 20               # mini-batch size sampled from the replay memory
learning_rate = 1e-3     # optimiser step size
gamma = 0.9              # discount factor applied to the next-state value
learn_step_counter = 0   # initial value of the training-step counter

# ---- Replay memory ----------------------------------------------------------
MEMORY_CAPACITY = 5_000  # maximum number of stored transitions
# Store of previously observed transitions (the agent's "experience").
replay_memory_store = deque()


class net(nn.Module):
    """Small fully connected network: s_dim inputs -> 30 hidden units -> a_dim outputs.

    The output vector holds one value per action for the given state input.
    """

    def __init__(self, s_dim, a_dim):
        """Create both linear layers and re-draw their weights from N(0, 0.1).

        Args:
            s_dim: size of the input (state) vector.
            a_dim: size of the output (one entry per action).
        """
        super().__init__()

        # Hidden layer; biases keep nn.Linear's default initialisation.
        self.fc1 = nn.Linear(s_dim, 30)
        nn.init.normal_(self.fc1.weight, mean=0.0, std=0.1)

        # Output layer, initialised the same way.
        self.out = nn.Linear(30, a_dim)
        nn.init.normal_(self.out.weight, mean=0.0, std=0.1)

    def forward(self, x):
        """Return the action values for the batch of state vectors `x`."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)


class Dqn(object):
    """DQN agent for the room maze described by the module-level reward table ``r``.

    States and actions are both room indices; they are fed to the networks as
    one-hot rows. ``eval_net`` is trained on mini-batches sampled from the
    replay memory and ``target_net`` is a periodically refreshed copy used to
    compute the bootstrap target.
    """

    # Index of the goal room; moving into it ends an episode.
    TARGET_STATE = 5
    # learn() is only called once more than this many transitions were stored.
    LEARN_START = 2000
    # train() stops once step_index exceeds this value.
    MAX_TRAIN_STEPS = 10000
    # learn() copies eval_net into target_net every this many calls.
    TARGET_SYNC_INTERVAL = 100
    # Upper bound on greedy moves per start room in pay(); guards against a
    # learned policy that cycles and never reaches the goal.
    MAX_EVAL_STEPS = 100

    def __init__(self, s_dim, a_dim):
        """Build the networks, optimiser, replay memory and exploration state.

        Args:
            s_dim: number of states (size of the one-hot state vector).
            a_dim: number of actions (size of the network output).
        """
        self.eval_net = net(s_dim, a_dim)
        self.target_net = net(s_dim, a_dim)
        self.learn_step_counter = learn_step_counter

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
        self.loss_func = nn.MSELoss()
        # Discount factor, kept on the instance so learn() is self-contained.
        self.gamma = gamma

        self.step_index = 0
        self.batch_size = BATCH  # mini-batch size
        self.memory_counter = 0  # total number of transitions stored so far
        # Not read by any method of this class; kept so the attribute stays available.
        self.memory = np.zeros((MEMORY_CAPACITY, 5))
        # NOTE: this is the module-level deque, so every Dqn instance created
        # in the same process shares one replay memory.
        self.replay_memory_store = replay_memory_store
        self.memory_size = MEMORY_CAPACITY  # replay memory capacity
        # Identity matrices: row i is the one-hot encoding of state / action i.
        self.state_list = np.identity(s_dim)
        self.action_list = np.identity(a_dim)
        self.INITIAL_EPSILON = INITIAL_EPSILON
        self.FINAL_EPSILON = FINAL_EPSILON
        self.EXPLORE = EXPLORE
        # Number of steps to observe before epsilon starts to decay.
        self.OBSERVE = 1000.
        # Current exploration rate; initialised once so the decay in
        # choose_action() accumulates across calls.
        self.epsilon = self.INITIAL_EPSILON

    def _state_tensor(self, state_index):
        """Return the one-hot encoding of `state_index` as a (1, n) float32 tensor."""
        row = self.state_list[state_index:state_index + 1]
        return torch.as_tensor(row, dtype=torch.float32)

    def choose_action(self, state_index):
        """Pick an action for `state_index` with an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action (over all
        actions, including the goal) is returned; otherwise the action with
        the highest value predicted by ``eval_net``. After the observation
        phase, epsilon is decayed linearly towards ``FINAL_EPSILON``.

        Returns:
            The chosen action index as a Python int.
        """
        if np.random.uniform() < self.epsilon:
            # randint's upper bound is exclusive, so use the full action count.
            action_index = int(np.random.randint(0, len(self.action_list)))
        else:
            action_value = self.eval_net(self._state_tensor(state_index)).detach().numpy()
            action_index = int(np.argmax(action_value))

        # Once training has started, shrink epsilon until it reaches its floor.
        if self.step_index > self.OBSERVE and self.epsilon > self.FINAL_EPSILON:
            decay = (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORE
            self.epsilon = max(self.FINAL_EPSILON, self.epsilon - decay)

        return action_index

    def store(self, current_state_index, current_action_index, current_reward, next_state_index, done):
        """Append one transition to the replay memory, evicting the oldest if full.

        States and the action are stored as (1, n) one-hot rows; the reward and
        the done flag are stored as given.
        """
        current_state = self.state_list[current_state_index:current_state_index + 1]
        current_action = self.action_list[current_action_index:current_action_index + 1]
        next_state = self.state_list[next_state_index:next_state_index + 1]

        self.replay_memory_store.append((
            current_state,
            current_action,
            current_reward,
            next_state,
            done))

        # Drop the oldest transition once the capacity is exceeded.
        if len(self.replay_memory_store) > self.memory_size:
            self.replay_memory_store.popleft()

        self.memory_counter += 1

    def train(self):
        """Interact with the maze and train until MAX_TRAIN_STEPS is exceeded.

        Episodes start in a random non-goal room and restart whenever step()
        reports done.
        """
        # Rooms 0 .. TARGET_STATE - 1: never start in the goal room.
        current_state = np.random.randint(0, self.TARGET_STATE)

        while True:
            action = self.choose_action(current_state)
            next_state, reward, done = self.step(current_state, action)
            self.store(current_state, action, reward, next_state, done)

            # Only start learning after enough transitions have been observed.
            if self.memory_counter > self.LEARN_START:
                self.learn()

            if self.step_index > self.MAX_TRAIN_STEPS:
                break

            if done:
                current_state = np.random.randint(0, self.TARGET_STATE)
            else:
                current_state = next_state

            self.step_index += 1

    def learn(self):
        """Run one optimisation step of eval_net on a random mini-batch.

        The target is ``reward`` alone when ``reward <= -1`` and
        ``reward + gamma * max_a target_net(next_state)[a]`` otherwise. The
        stored done flag is not used by this rule. Does nothing when the
        replay memory is empty.
        """
        if not self.replay_memory_store:
            return

        # Refresh the target network periodically; the counter must really
        # advance or the copy would happen on every call.
        if self.learn_step_counter % self.TARGET_SYNC_INTERVAL == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        batch = min(self.batch_size, len(self.replay_memory_store))
        samples = random.sample(self.replay_memory_store, batch)
        states, actions, rewards, next_states, _dones = zip(*samples)

        batch_state = torch.as_tensor(np.vstack(states), dtype=torch.float32)
        batch_next_state = torch.as_tensor(np.vstack(next_states), dtype=torch.float32)
        # Actions are stored one-hot; recover the index of the 1 in each row.
        action_index = torch.as_tensor(
            np.vstack(actions).argmax(axis=1), dtype=torch.int64).unsqueeze(1)
        # Keep rewards as floats so fractional rewards are not truncated.
        batch_reward = torch.as_tensor(
            np.asarray(rewards, dtype=np.float32)).unsqueeze(1)

        # Estimated Q-value of the action that was actually taken.
        q_eval = self.eval_net(batch_state).gather(1, action_index)
        # Best next-state value according to the (frozen) target network.
        with torch.no_grad():
            q_next = self.target_net(batch_next_state).max(dim=1, keepdim=True)[0]

        q_target = torch.where(
            batch_reward <= -1,
            batch_reward,
            batch_reward + self.gamma * q_next)

        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def step(self, state, action):
        """Apply `action` in `state` and return ``(next_state, reward, done)``.

        The reward is read from the module-level table ``r``; the next state is
        the action index itself, and the episode is done when the goal room is
        entered.
        """
        reward = r[state][action]
        next_state = action
        done = bool(action == self.TARGET_STATE)

        return next_state, reward, done

    def pay(self):
        """Train the agent, then replay the greedy policy from every non-goal room.

        For each start room the predicted action values and every move are
        printed. The walk stops when the goal is reached or after
        MAX_EVAL_STEPS moves, so a cycling policy cannot hang the program.
        """
        self.train()
        target_state = self.TARGET_STATE

        for start_room in range(len(self.state_list)):
            if start_room == target_state:
                continue

            print("#############################", "Agent 在", start_room, "开始行动", "#############################")
            current_state = start_room
            step = 0

            while current_state != target_state and step < self.MAX_EVAL_STEPS:
                out_result = self.eval_net(self._state_tensor(current_state)).detach().numpy()
                next_state = int(np.argmax(out_result[0]))
                print(out_result[0])
                print("Agent 由", current_state, "号房间移动到了", next_state, "号房间")
                current_state = next_state
                step += 1

            if current_state == target_state:
                print("Agent 在", start_room, "号房间开始移动了", step, "步到达了目标房间 5")
            else:
                print("Agent 在", start_room, "号房间开始移动了", step, "步仍未到达目标房间 5")

            print("#############################", "Agent 在", current_state, "结束行动", "#############################")


if __name__ == "__main__":
    # The maze has six rooms, used both as states and as actions.
    s_dim = a_dim = 6
    # Build the agent, then train it and print the greedy walk from each room.
    dqn = Dqn(s_dim, a_dim)
    dqn.pay()

一篇菜鸡写的博客，有错误还请大家帮忙指出，谢谢。