基于 PyTorch 的 DQN 算法实现

参考文章:
- https://www.cnblogs.com/cjnmy36723/p/7018860.html
- https://www.pythonheidong.com/blog/article/363261/59ae746d690b1ffb13c0/
- https://blog.csdn.net/weixin_40759186/article/details/87524192

感谢老师们的文章。

很多文章使用 gym 来实现强化学习算法,这里使用的是自己创建的简单环境(即下文代码中的 6×6 奖励矩阵 r),如图:
(此处原有环境示意图)
代码比较简单,直接上代码吧:

import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque


# Reward table of the environment, indexed as r[state][action].
r = np.array(
    [
        [-1, -1, -1, -1, 0, -1],
        [-1, -1, -1, 0, -1, 100.0],
        [-1, -1, -1, 0, -1, -1],
        [-1, 0, 0, -1, 0, -1],
        [0, -1, -1, 1, -1, 100],
        [-1, 0, -1, -1, 0, 100],
    ]
)


# --- Environment size -------------------------------------------------------
state_num = 6   # number of states
action_num = 6  # number of actions

# --- Exploration (epsilon-greedy) -------------------------------------------
BATCH = 20               # number of samples drawn for each mini-batch
FINAL_EPSILON = 0.0001   # minimum value of epsilon
INITIAL_EPSILON = 0.1    # starting value of epsilon; it is reduced gradually
EXPLORE = 3e6            # total number of steps over which epsilon decays
epsilon = 0              # exploration-mode counter

# --- Optimisation -----------------------------------------------------------
learn_step_counter = 0   # number of training steps performed so far
learning_rate = 0.001    # optimizer learning rate
gamma = 0.9              # discount factor applied to future rewards

# --- Experience replay ------------------------------------------------------
MEMORY_CAPACITY = 5000          # upper limit on stored transitions
replay_memory_store = deque()   # transitions the agent has experienced so far


class net(nn.Module):
    """Two-layer fully connected Q-network: s_dim -> 30 -> a_dim.

    Both weight matrices are re-drawn from a normal distribution with
    mean 0 and standard deviation 0.1; the biases keep the default
    ``nn.Linear`` initialization.
    """

    def __init__(self, s_dim, a_dim):
        """Create the hidden layer and the output layer.

        Args:
            s_dim: Size of the input (state) vector.
            a_dim: Number of outputs, one value per action.
        """
        super().__init__()
        # Each layer is created and then re-initialized before the next one
        # is built, so random numbers are consumed layer by layer.
        self.fc1 = nn.Linear(s_dim, 30)
        nn.init.normal_(self.fc1.weight, mean=0, std=0.1)
        self.out = nn.Linear(30, a_dim)
        nn.init.normal_(self.out.weight, mean=0, std=0.1)

    def forward(self, x):
        """Return the action values for the input batch ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)


class Dqn(object):
    """DQN agent for the small room environment described by the table ``r``.

    States and actions are both room indices: taking action ``k`` moves the
    agent to room ``k`` and yields ``r[state][k]``. States and actions are
    fed to the networks / stored in the replay memory as one-hot rows.

    The agent keeps two networks: ``eval_net`` is trained on every call to
    `learn`, while ``target_net`` is a periodically refreshed copy that is
    used to compute the bootstrap part of the training target.
    """

    # Index of the goal room; taking action 5 ends an episode.
    GOAL_STATE = 5
    # `train` starts calling `learn` once more than this many transitions
    # have been stored.
    LEARN_START = 2000
    # `train` stops once `step_index` exceeds this value.
    MAX_TRAIN_STEPS = 10000
    # `learn` copies eval_net into target_net once every this many calls.
    TARGET_SYNC_EVERY = 100
    # Upper bound on greedy moves per start room in `pay`, so that a policy
    # which cycles between rooms cannot hang the program.
    MAX_ROLLOUT_STEPS = 100

    def __init__(self, s_dim, a_dim):
        """Build the networks, the optimizer and the bookkeeping state.

        Args:
            s_dim: Length of the one-hot state vector fed to the networks.
            a_dim: Number of actions, i.e. number of network outputs.
        """
        self.eval_net = net(s_dim, a_dim)
        self.target_net = net(s_dim, a_dim)
        self.learn_step_counter = learn_step_counter

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
        self.loss_func = nn.MSELoss()

        self.state_num = s_dim
        self.action_num = a_dim
        # Discount factor and reward table, captured from the module-level
        # settings so the methods below only depend on instance state.
        self.gamma = gamma
        self.reward_matrix = r

        self.step_index = 0
        self.batch_size = BATCH  # mini-batch size
        self.memory_counter = 0  # number of transitions stored so far
        # Not read by any method of this class; kept so existing attribute
        # access keeps working.
        self.memory = np.zeros((MEMORY_CAPACITY, 5))
        # NOTE: this is the module-level deque, so every Dqn instance
        # appends to the same replay memory.
        self.replay_memory_store = replay_memory_store
        self.memory_size = MEMORY_CAPACITY  # replay memory capacity
        # Identity matrices: row i is the one-hot encoding of state/action i.
        self.state_list = np.identity(s_dim)
        self.action_list = np.identity(a_dim)
        self.INITIAL_EPSILON = 0.1
        self.FINAL_EPSILON = 0.0001
        self.EXPLORE = 3000000.
        # Number of steps to observe before epsilon starts to decay.
        self.OBSERVE = 1000.
        # Current exploration rate. It is initialized once here (not in
        # choose_action) so that the decay accumulates across calls.
        self.epsilon = self.INITIAL_EPSILON

    def choose_action(self, state_index):
        """Pick an action for ``state_index`` with an epsilon-greedy rule.

        With probability ``self.epsilon`` a uniformly random action is
        returned, otherwise the action with the largest value predicted by
        ``eval_net``. After the observation phase (``step_index > OBSERVE``)
        each call also lowers ``epsilon`` by one linear decay step until it
        reaches ``FINAL_EPSILON``.

        Args:
            state_index: Index of the current state.

        Returns:
            The index of the chosen action.
        """
        current_state = torch.FloatTensor(self.state_list[state_index:state_index + 1])
        action_value = self.eval_net(current_state).detach().numpy()
        current_action_index = np.argmax(action_value)

        if np.random.uniform() < self.epsilon:
            # randint's upper bound is exclusive: pass action_num so that
            # every action, including the last one, can be explored.
            current_action_index = np.random.randint(0, self.action_num)

        # Once training has started, shrink epsilon step by step until it
        # reaches its lower bound.
        if self.step_index > self.OBSERVE and self.epsilon > self.FINAL_EPSILON:
            self.epsilon -= (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORE

        return current_action_index

    def store(self, current_state_index, current_action_index, current_reward, next_state_index, done):
        """Append one transition to the replay memory.

        States and the action are stored as one-hot rows of shape (1, n).
        When the memory grows beyond ``memory_size`` the oldest transition
        is dropped.

        Args:
            current_state_index: Index of the state the action was taken in.
            current_action_index: Index of the action that was taken.
            current_reward: Reward received for the action.
            next_state_index: Index of the resulting state.
            done: Whether the transition ended the episode.
        """
        current_state = self.state_list[current_state_index:current_state_index + 1]
        current_action = self.action_list[current_action_index:current_action_index + 1]
        next_state = self.state_list[next_state_index:next_state_index + 1]

        self.replay_memory_store.append((
            current_state,
            current_action,
            current_reward,
            next_state,
            done))

        # Forget the oldest memory once the capacity is exceeded.
        if len(self.replay_memory_store) > self.memory_size:
            self.replay_memory_store.popleft()

        self.memory_counter += 1

    def train(self):
        """Interact with the environment and train ``eval_net``.

        Episodes start in a random non-goal room. Every transition is
        stored; once more than ``LEARN_START`` transitions have been
        collected, `learn` is called after every step. The loop ends when
        ``step_index`` exceeds ``MAX_TRAIN_STEPS``.
        """
        current_state = np.random.randint(0, self.GOAL_STATE)

        while True:
            action = self.choose_action(current_state)

            next_state, reward, done = self.step(current_state, action)

            self.store(current_state, action, reward, next_state, done)
            # Only start learning after enough transitions were observed.
            if self.memory_counter > self.LEARN_START:
                self.learn()

            if self.step_index > self.MAX_TRAIN_STEPS:
                break
            if done:
                # Episode finished: restart from a random non-goal room.
                current_state = np.random.randint(0, self.GOAL_STATE)
            else:
                current_state = next_state

            self.step_index += 1

    def learn(self):
        """Run one optimization step on a random mini-batch.

        Every ``TARGET_SYNC_EVERY`` calls the weights of ``eval_net`` are
        copied into ``target_net`` first. The training target for a sampled
        transition is the reward alone when the reward is <= -1, and
        ``reward + gamma * max_a Q_target(next_state, a)`` otherwise. The
        stored ``done`` flag is not used by the target computation.

        Does nothing when the replay memory is empty.
        """
        if not self.replay_memory_store:
            return

        if self.learn_step_counter % self.TARGET_SYNC_EVERY == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Never ask for more samples than the memory actually holds.
        batch = min(self.batch_size, len(self.replay_memory_store))
        data_batch = random.sample(self.replay_memory_store, batch)

        states, actions, rewards, next_states, _dones = zip(*data_batch)

        # Each stored state/action is a (1, n) one-hot row; stack to (B, n).
        batch_state = torch.as_tensor(np.vstack(states), dtype=torch.float32)
        batch_next_state = torch.as_tensor(np.vstack(next_states), dtype=torch.float32)
        # One-hot action rows -> column vector of action indices for gather.
        action = torch.as_tensor(
            np.vstack(actions).argmax(axis=1), dtype=torch.long).view(-1, 1)
        # Rewards stay floating point so fractional rewards are not truncated.
        batch_reward = torch.as_tensor(
            np.asarray(rewards, dtype=np.float32).reshape(-1, 1))

        # Q estimate of the actions that were actually taken.
        q_eval = self.eval_net(batch_state).gather(1, action)
        # Largest target-network value of each next state; no gradient flows
        # through the target.
        with torch.no_grad():
            q_next = self.target_net(batch_next_state).max(1, keepdim=True)[0]

        # Q target: rewards <= -1 are used as-is, all others bootstrap.
        q_target = torch.where(
            batch_reward <= -1,
            batch_reward,
            batch_reward + self.gamma * q_next)

        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the outcome.

        Args:
            state: Index of the current state.
            action: Index of the action; it is also the next state.

        Returns:
            A tuple ``(next_state, reward, done)`` where ``reward`` is read
            from the reward table and ``done`` is True when the action is
            the goal room.
        """
        reward = self.reward_matrix[state][action]
        next_state = action
        done = bool(action == self.GOAL_STATE)

        return next_state, reward, done

    def pay(self):
        """Train the agent, then replay the greedy policy from each start room.

        For every non-goal room the agent repeatedly moves to the room with
        the largest predicted value and prints each move. A rollout stops
        when the goal room is reached or after ``MAX_ROLLOUT_STEPS`` moves.
        """
        self.train()
        target_state = self.GOAL_STATE
        for index in range(target_state):
            start_room = index

            print("#############################", "Agent 在", start_room, "开始行动", "#############################")
            current_state = start_room
            step = 0

            while current_state != target_state and step < self.MAX_ROLLOUT_STEPS:
                state_row = torch.FloatTensor(self.state_list[current_state:current_state + 1])
                out_result = self.eval_net(state_row).detach().numpy()
                next_state = np.argmax(out_result[0])
                print(out_result[0])
                print("Agent 由", current_state, "号房间移动到了", next_state, "号房间")
                current_state = next_state
                step += 1

            if current_state == target_state:
                print("Agent 在", start_room, "号房间开始移动了", step, "步到达了目标房间 5")
            else:
                # The greedy policy is cycling; report it instead of looping forever.
                print("Agent 在", start_room, "号房间开始移动了", step, "步仍未到达目标房间 5")

            print("#############################", "Agent 在", current_state, "结束行动", "#############################")


if __name__ == "__main__":
    # The environment has 6 states and 6 actions.
    s_dim = a_dim = 6
    dqn = Dqn(s_dim, a_dim)
    # Train the agent, then print the greedy path from every start room.
    dqn.pay()

一篇菜鸡写的博客,有错误还请大家帮忙指出,谢谢。

DQN(Deep Q-Network)算法是一种深度强化学习(Deep Reinforcement Learning)技术,适用于状态空间较大(可以是连续的)而动作空间离散的问题,如控制物理系统的应用;对于连续动作空间,通常需要先将动作离散化,或改用 DDPG 等算法。在Python中,可以使用像`TensorFlow`或`PyTorch`这样的库来实现DQN,并应用于倒立摆一类的控制系统。

首先,你需要了解以下几个关键步骤:

1. **环境模拟**:使用如`gym`(一个流行的开源环境库)中的`Acrobot-v1`环境,它模拟的是双连杆摆的摆起任务(经典的倒立摆平衡任务对应的是`CartPole-v1`)。
2. **网络结构**:构建一个深度神经网络(Q-Network),作为策略评估函数,输入是状态观测,输出是每个可行动作对应的Q值。
3. **经验回放**:存储每一个时间步的经验(观察、动作、奖励、新状态),通过随机采样进行训练,减少数据相关性并稳定训练过程。
4. **训练循环**:在每个时间步,选择一个动作(通常是ε-greedy策略),执行动作并与环境交互,然后更新网络的Q值。使用损失函数(通常是 TD 目标 $r + \gamma \max_{a'} Q_{\text{target}}(s', a')$ 与当前动作的Q值 $Q(s, a)$ 之差的平方)优化网络。
5. **目标网络**:为了稳定性,按固定频率将主网络的权重复制到目标网络(目标网络保存的是若干步之前的主网络权重)。

下面是简单的伪代码框架:

```python
import gym
from keras.models import Model
from keras.layers import Dense

# 初始化环境和模型
env = gym.make('Acrobot-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
model = build_model(state_size, action_size)

# 训练函数
def trainEpisode():
    ...

# 主循环
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        # 探索与利用
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = model.predict(state)[0].argmax()
        next_state, reward, done, info = env.step(action)
        ...
    # 更新目标网络和主网络
    update_target_network(model, target_model)
    # 打印信息并保存模型
    print(f"Episode {episode+1} finished after {timesteps} timesteps")

# 关闭环境
env.close()
```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值