DM-DQN: Dueling Munchausen deep Q network for robot path planning (paper reading notes, to be continued)

DM-DQN: Dueling Munchausen deep Q network for robot path planning

Complex & Intelligent Systems
Recommended reading: https://github.com/AmazingAng/deep-RL-elements

Purpose of the paper

To achieve collision-free path planning in complex environments.

What the paper does

It uses the DM-DQN algorithm, which is derived from the M-DQN (Munchausen DQN) algorithm:

  • Splits the network into a value stream and an advantage stream (a dueling architecture), which speeds up convergence and gives better overall performance
  • Incorporates the artificial potential field method

Some background on the M-DQN algorithm is inserted further below (originally generated with ChatGPT-4).

Main contributions of the paper

  • Uses the Gazebo simulation platform instead of a conventional simulator; its simulation is much closer to the real world
  • Splits the network into a value stream and an advantage stream (dueling architecture), which speeds up convergence and gives better overall performance
  • Uses the artificial potential field method for obstacle avoidance (a minimal reward-shaping sketch follows this list)
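
As referenced in the last bullet, here is a minimal sketch of artificial-potential-field reward shaping. The attractive/repulsive form and every constant below are illustrative choices of mine; the paper's actual reward design is not reproduced here.

import numpy as np

def apf_reward(position, goal, obstacles, k_att=1.0, k_rep=100.0, d0=2.0):
    """Reward shaping from an artificial potential field (illustrative only).

    The attractive potential pulls the robot toward the goal; the repulsive
    potential pushes it away from obstacles closer than the influence radius d0.
    The negative total potential is returned as a dense shaping reward.
    """
    position = np.asarray(position, dtype=float)
    goal = np.asarray(goal, dtype=float)

    # Attractive potential: quadratic in the distance to the goal.
    u_att = 0.5 * k_att * np.sum((position - goal) ** 2)

    # Repulsive potential: only active within the influence radius d0.
    u_rep = 0.0
    for obs in obstacles:
        d = np.linalg.norm(position - np.asarray(obs, dtype=float))
        if 1e-6 < d < d0:
            u_rep += 0.5 * k_rep * (1.0 / d - 1.0 / d0) ** 2

    return -(u_att + u_rep)  # lower potential -> higher reward

# Example: reward is low next to an obstacle and higher near the goal.
print(apf_reward([1, 1], goal=[9, 9], obstacles=[[1, 2]]))
print(apf_reward([8, 8], goal=[9, 9], obstacles=[[1, 2]]))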

About Gazebo

[ROS技术点滴 —— Gazebo物理仿真平台 - 知乎 (zhihu.com)](https://zhuanlan.zhihu.com/p/50400789)

Gazebo is a powerful 3D physics simulation platform with a strong physics engine, high-quality graphics rendering, and convenient programming and graphical interfaces; importantly, it is open source and free.

Theoretical background

[Figure: image-20230401153752268]

Introduces the reinforcement learning process.

Some background on Q-learning:

[Figure: image-20230401153852866]

Meaning of the symbols:

  • Q is the value function of a state–action pair
  • γ (gamma) is the discount factor
  • s is the state
  • a is the action

Drawback:

The true Q function is unknown, so the current estimate of Q has to be used in its place (bootstrapping).
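
For reference, a minimal sketch of the tabular Q-learning update, Q(s, a) ← Q(s, a) + α [ r + γ·max_a' Q(s', a') − Q(s, a) ]; the table size and constants below are my own illustrative choices:

import numpy as np

n_states, n_actions = 16, 4
Q = np.zeros((n_states, n_actions))   # tabular state-action value function
alpha, gamma = 0.1, 0.9               # learning rate and discount factor

def q_learning_update(s, a, r, s_next, done):
    """One Q-learning step: move Q[s, a] toward the bootstrapped target."""
    target = r if done else r + gamma * np.max(Q[s_next])
    Q[s, a] += alpha * (target - Q[s, a])

# Example transition: in state 0, action 2 gave reward -1 and led to state 1.
q_learning_update(s=0, a=2, r=-1.0, s_next=1, done=False)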

The M-DQN algorithm

M-DQN here is the Munchausen deep Q network, a value-based algorithm that extends DQN. It keeps the usual DQN machinery (experience replay, target network) but builds on the soft, maximum-entropy form of DQN and, in addition, adds a scaled log-policy term to the immediate reward.

Concretely, the policy is taken to be the softmax of the Q-values, the hard max in the bootstrapped target is replaced by the soft (entropy-regularized) value, and a "Munchausen" bonus proportional to the log-probability of the action actually taken is added to the reward. This can be shown to act as an implicit KL regularization between successive policies, which stabilizes learning.

A drawback of the plain DQN update is that it only adjusts the value of the single action that was taken; the log-policy bonus injects information about the whole action distribution into the learning signal.

Relative to DQN, the 'log-policy' term takes the logarithm of the action probability, mapping probabilities in (0, 1] to values in (−∞, 0], so the reward becomes

[Figure: image-20230401155627175]

The expression above can sometimes still be impossible to evaluate: as the probability of an action goes to 0, its logarithm diverges to −∞, which is why the log-policy term is clipped in practice (the clipped target is written out below).
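
For completeness, the Munchausen-DQN regression target as given in the original M-DQN formulation (added here as background; the notation is mine and may differ slightly from the paper's):

$$
\hat q_{\text{m-dqn}}
= r_t + \alpha\big[\tau \ln \pi_{\bar\theta}(a_t \mid s_t)\big]_{l_0}^{0}
+ \gamma \sum_{a'} \pi_{\bar\theta}(a' \mid s_{t+1})
\Big( q_{\bar\theta}(s_{t+1}, a') - \tau \ln \pi_{\bar\theta}(a' \mid s_{t+1}) \Big),
\qquad
\pi_{\bar\theta} = \operatorname{softmax}\!\big(q_{\bar\theta}/\tau\big),
$$

where $[\cdot]_{l_0}^{0}$ clips the log-policy term to $[l_0, 0]$ so that it stays finite, $\tau$ is the entropy temperature, and $\alpha$ scales the Munchausen bonus.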

To make this easier to follow, here is a DQN-style example for reference.

The Python snippet below runs under Python 3.9 on Windows 11.

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import tkinter as tk
from torch import Tensor
from typing import List, Tuple
from torch.optim import Optimizer

# GridWorld environment
class GridWorld:
    def __init__(self, size, target):
        self.size = size
        self.target = target
        self.state = np.zeros((size, size))

    def reset(self, start):
        self.state = np.zeros((self.size, self.size))
        self.state[tuple(start)] = 1
        return start

    def step(self, position, action):
        self.state[tuple(position)] = 0
        new_position = np.array(position) + np.array(action)

        if np.any(new_position < 0) or np.any(new_position >= self.size):
            new_position = np.array(position)  # invalid move: stay in the current cell

        self.state[tuple(new_position)] = 1
        reward = 1 if np.all(new_position == self.target) else -1
        done = np.all(new_position == self.target)
        return new_position, reward, done

# Q-network (this example is a plain DQN; a Munchausen modification is sketched after the code)
class M_DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(M_DQN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.layers(x)

# Helper functions for M-DQN training
def epsilon_greedy_action(state, epsilon, model):
    if np.random.rand() < epsilon:
        return np.random.randint(0, 4)
    else:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state)
            q_values = model(state_tensor)
            return q_values.argmax().item()



def update_model(
    model: torch.nn.Module,
    target_model: torch.nn.Module,
    memory: List[Tuple[Tensor, int, float, Tensor, bool]],
    optimizer: Optimizer,
    batch_size: int,
    discount_factor: float,
) -> None:
    # If memory size is less than batch size, do not update the model.
    if len(memory) < batch_size:
        return

    # Sample a batch of experiences from the memory.
    states, actions, rewards, next_states, dones = zip(*random.sample(memory, batch_size))

    # Convert the sampled data into PyTorch tensors.
    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions).unsqueeze(1)
    rewards = torch.FloatTensor(rewards)
    next_states = torch.FloatTensor(next_states)
    dones = torch.BoolTensor(dones)

    # Compute the current Q-values using the model.
    current_q_values = model(states).gather(1, actions)

    # Compute the target Q-values using the target model.
    next_q_values = target_model(next_states).max(1)[0].detach()
    target_q_values = rewards + (~dones) * discount_factor * next_q_values


    # Compute the loss between the current and target Q-values.
    loss_function = torch.nn.MSELoss()
    loss = loss_function(current_q_values, target_q_values.unsqueeze(1))

    # Update the model using the computed loss.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


# GridWorld GUI
class GridWorldGUI(tk.Tk):
    def __init__(self, grid_size, target, delay=100):
        super().__init__()
        self.grid_size = grid_size
        self.target = target
        self.delay = delay
        self.title("M-DQN GridWorld")
        self.canvas = tk.Canvas(self, width=500, height=500, bg="white")
        self.canvas.pack()

    def render(self, state):
        self.canvas.delete("all")
        cell_size = 500 // self.grid_size

        for i in range(self.grid_size):
            for j in range(self.grid_size):
                if (i, j) == tuple(self.target):
                    color = "green"
                elif (i, j) == tuple(state):
                    color = "red"
                else:
                    color = "white"

                self.canvas.create_rectangle(j * cell_size, i * cell_size, (j + 1) * cell_size, (i + 1) * cell_size,
                                             fill=color, outline="black")
        self.update()            # redraw once per render call, then pause briefly
        self.after(self.delay)

# Hyperparameters
grid_size = 10
start = [0, 0]
target = [9, 9]
episodes = 500
learning_rate = 0.001
memory_size = 10000
batch_size = 64
discount_factor = 0.99
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
target_update_freq = 10
# Initialize environment and models
grid_world = GridWorld(grid_size, target)
input_dim = 2
output_dim = 4
model = M_DQN(input_dim, output_dim)
target_model = M_DQN(input_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

memory = []
epsilon = epsilon_start
# Initialize GridWorld GUI
grid_world_gui = GridWorldGUI(grid_size, target)
# Main training loop
for episode in range(episodes):
    state = grid_world.reset(start)
    done = False
    episode_length = 0
    while not done:
        action = epsilon_greedy_action(state, epsilon, model)
        next_state, reward, done = grid_world.step(state, np.array([[-1, 0], [1, 0], [0, -1], [0, 1]])[action])
        memory.append((state, action, reward, next_state, done))
        memory = memory[-memory_size:]

        update_model(model, target_model, memory, optimizer, batch_size, discount_factor)
        state = next_state
        episode_length += 1

        grid_world_gui.render(state)

        if done:
            grid_world_gui.after(1000)  # Pause for 1000ms before starting the next episode

    epsilon = max(epsilon_end, epsilon * epsilon_decay)

    if episode % target_update_freq == 0:
        target_model.load_state_dict(model.state_dict())

    print(f"Episode {episode}, Length: {episode_length}")

Introducing soft-DQN

[Figure: image-20230401155931950]

[Figure: image-20230401160238140]

The figures describe the relevant parameters.
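
As background (the standard maximum-entropy / soft-DQN formulation, not copied from the paper's figures): soft-DQN uses the softmax policy induced by the Q-values and a soft value in place of the hard max,

$$
\pi(a \mid s) = \frac{\exp\big(q(s,a)/\tau\big)}{\sum_{a'} \exp\big(q(s,a')/\tau\big)},
\qquad
V_{\text{soft}}(s) = \tau \ln \sum_{a} \exp\big(q(s,a)/\tau\big),
$$

so the bootstrapped target is $r_t + \gamma V_{\text{soft}}(s_{t+1})$; as the temperature $\tau \to 0$ this recovers the ordinary DQN target with the hard max.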

Explanations of a few concepts that come up in the paper:

关于KL散度(Kullback-Leibler Divergence)的笔记 - 知乎 (zhihu.com)

强化学习进阶 第七讲 TRPO - 知乎 (zhihu.com)

【强化学习 83】MPO - 知乎 (zhihu.com)

TRPO: a culmination of gradient-based policy optimization approaches.
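
Since KL divergence is central to these references (M-DQN can be shown to perform an implicit KL regularization between successive policies), its definition for two discrete policies, in my own notation:

$$
D_{\mathrm{KL}}\big(\pi_1(\cdot \mid s) \,\|\, \pi_2(\cdot \mid s)\big)
= \sum_{a} \pi_1(a \mid s) \ln \frac{\pi_1(a \mid s)}{\pi_2(a \mid s)} \;\ge\; 0 .
$$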

[Figure: image-20230401195306965]

DM-DQN

[Figure: image-20230401195511888]

As shown in the figure, the DM-DQN network is split into two streams, a value function and an advantage function, which handle the cases where no obstacle is detected and where an obstacle is detected, respectively (a minimal dueling-head sketch follows the figures below).

[Figure: image-20230401195753396]

[Figure: image-20230401200143335]
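
A minimal sketch of the dueling aggregation Q(s, a) = V(s) + A(s, a) − mean_a A(s, a), following the standard dueling-DQN construction (the layer sizes here are illustrative, not the paper's):

import torch
import torch.nn as nn

class DuelingQNetwork(nn.Module):
    """Dueling head: shared trunk, then separate value and advantage streams."""

    def __init__(self, input_dim, output_dim, hidden_dim=64):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU())
        self.value = nn.Linear(hidden_dim, 1)                # V(s): one scalar per state
        self.advantage = nn.Linear(hidden_dim, output_dim)   # A(s, a): one value per action

    def forward(self, x):
        h = self.trunk(x)
        v = self.value(h)       # shape [batch, 1]
        a = self.advantage(h)   # shape [batch, n_actions]
        # Subtract the mean advantage so the value/advantage split is identifiable.
        return v + a - a.mean(dim=1, keepdim=True)

# Illustrative drop-in replacement for M_DQN in the example above:
# model = DuelingQNetwork(input_dim=2, output_dim=4)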

Comparison of the two networks:

[Figure: image-20230401200658323]

Main algorithm

[Figure: image-20230401201048894]

[Figure: image-20230401201915717]

The main algorithm section is to be continued.

