DQN Code Walkthrough

Notes on the DQN code I use myself. I roughly understand it now, so I'm jotting things down as I go.

1. Imports

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque
import random

2. First class: building the neural network

class Net(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Net, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.fc1 = nn.Linear(self.state_dim, 64)
        self.fc1.weight.data.normal_(0, 0.1)
        self.fc2 = nn.Linear(64, 128)
        self.fc2.weight.data.normal_(0, 0.1)
        self.fc3 = nn.Linear(128, self.action_dim)
        self.fc3.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = x.view(x.size(0), x.size(-1))
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x

For self and __init__, the following reference is helpful:

解惑(一) ----- super(XXX, self).__init__()到底是代表什么含义_奋斗の博客-CSDN博客

My own notes:

2.1.class Net(nn.Module):

   Inherits from nn.Module, i.e. the parent class is nn.Module.

2.2.__init__(self, state_dim, action_dim)

   self is the instance itself; within this class, self can roughly be read as the particular Net instance.

   state_dim and action_dim are the parameters the network needs at construction time: the state dimension and the action dimension (two integers).

2.3.super(Net, self).__init__()

    The Net class inherits from the parent class nn.Module.

    super(Net, self).__init__() initializes the attributes inherited from nn.Module (so you don't have to write that initialization yourself).
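
    A minimal sketch of my own (not from the original post) showing why that call matters: without it, nn.Module's internal bookkeeping is never set up, so assigning a submodule fails.

class Broken(nn.Module):
    def __init__(self):
        # deliberately NOT calling super(Broken, self).__init__()
        self.fc = nn.Linear(4, 2)   # Broken() raises AttributeError: "cannot assign module before Module.__init__() call"

class Works(nn.Module):
    def __init__(self):
        super(Works, self).__init__()   # parent class initialized first
        self.fc = nn.Linear(4, 2)       # submodule is registered normally

    Broken() fails the moment it is constructed; Works() behaves as expected.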

2.4. The forward function's input x is the state (a tensor of shape [1, state_dim]): the original state is a vector, an extra dimension is added, and it is then converted to a tensor.

   The network's final output x is the Q-value of each action (a tensor of shape [1, action_dim], carrying a grad_fn).
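
   As a quick sanity check of those shapes, here is a minimal sketch of my own (state_dim=4 and action_dim=2 are arbitrary example values):

net = Net(state_dim=4, action_dim=2)
state = np.random.rand(4)                        # raw state: a length-4 vector
x = torch.FloatTensor(np.expand_dims(state, 0))  # add a batch dimension -> shape [1, 4]
q = net(x)                                       # forward pass
print(q.shape)      # torch.Size([1, 2]): one Q-value per action
print(q.grad_fn)    # not None, so gradients can flow back through the network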

3. Second class: the replay buffer

class replay_buffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=self.capacity)     # key: the oldest entry is dropped automatically once the buffer is full

    def store(self, state, action, reward, next_state ):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.memory.append([state, action, reward, next_state])


    def sample(self, size):
        batch = random.sample(self.memory, size)
        state, action, reward, next_state = zip(* batch)
        return np.concatenate(state, 0), action, reward, np.concatenate(next_state, 0)

    def __len__(self):
        return len(self.memory)

3.1.class replay_buffer(object):

   Inherits from object.

3.2.def __init__(self, capacity):

   Initializing this class requires one parameter, capacity (the buffer's capacity, a single number).

   The key part is that the buffer is built with deque(maxlen=self.capacity). It behaves much like a list, but guarantees that once the capacity is full the oldest entry is discarded, which is exactly what a replay buffer needs. A small example follows the reference below.

Python collections模块之deque()详解_chl183的博客-CSDN博客
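
   A tiny sketch of that behavior (my own example, not from the post):

d = deque(maxlen=3)   # capacity of 3, like a miniature replay buffer
for i in range(5):
    d.append(i)
print(d)              # deque([2, 3, 4], maxlen=3): the oldest items 0 and 1 were dropped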

3.3.def store(self, state, action, reward, next_state ):

np.expand_dims()_hong615771420的博客-CSDN博客

   The original state is a plain list [ ]; after the dimension expansion it becomes an array [[ ]] of shape 1 x state_dim.
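
   A small illustration of my own (using a made-up 4-dimensional state):

state = [0.1, 0.2, 0.3, 0.4]         # raw state as a plain list
expanded = np.expand_dims(state, 0)  # insert a new axis at position 0
print(expanded.shape)                # (1, 4): now a 1 x state_dim array [[...]]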

3.4.def sample(self, size):

   Samples a batch from the replay buffer.

   batch = random.sample(self.memory, size)

   memory holds N stored transitions; random.sample draws size of them without replacement and returns them as a list of length size.

   state, action, reward, next_state = zip(* batch)

   The resulting state is a tuple whose elements are the individual states, each one an array.
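
   A toy example of my own showing what zip(*batch) does:

batch = [("s1", 0, 1.0, "s1_next"), ("s2", 1, 0.5, "s2_next")]
state, action, reward, next_state = zip(*batch)
print(state)    # ('s1', 's2'): all the states grouped together
print(action)   # (0, 1)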

   np.concatenate(state, 0)

    numpy数组拼接方法介绍_zyl1042635242的专栏-CSDN博客

   Completes the concatenation, turning the tuple into a single array with one state per row. A combined shape check follows below.
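
   Putting store and sample together, a minimal sketch of my own (fake 4-dimensional states, two actions):

buffer = replay_buffer(capacity=100)
for _ in range(10):
    s = np.random.rand(4)
    s_next = np.random.rand(4)
    buffer.store(s, random.randint(0, 1), 1.0, s_next)

states, actions, rewards, next_states = buffer.sample(5)
print(states.shape)       # (5, 4): one state per row
print(next_states.shape)  # (5, 4)
print(actions)            # a tuple of 5 actions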

4. Third class: DQN


class DQN(object):
    def __init__(self,state_dim,action_dim,learning_rate):
        self.eval_net = Net(state_dim,action_dim)
        self.target_net = Net(state_dim,action_dim)
        self.learn_step_counter = 0

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()
        # the learning rate stays fixed

    def get_action(self, state, epsilon,action_dim):
        state = torch.FloatTensor(np.expand_dims(state, 0))
        if random.random() < epsilon:
            action_value = self.eval_net.forward(state)
            action = action_value.max(1)[1].data[0].item()
        else:
            action = random.choice(list(range(action_dim)))
        return action

    def training(self,buffer, batch_size, gamma,target_replace_iter):
        # update the target net
        if self.learn_step_counter % target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1


        state, action, reward, next_state = buffer.sample(batch_size)
        state = torch.FloatTensor(state)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        next_state = torch.FloatTensor(next_state)

        q_values = self.eval_net.forward(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_values = self.target_net.forward(next_state)
        next_q_value = next_q_values.max(1)[0].detach()
        expected_q_value = reward + next_q_value * gamma

        loss = self.loss_fn(q_value, expected_q_value.detach())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss

    def reset(self,state_dim, action_dim,learning_rate,memory):
        self.eval_net = Net(state_dim, action_dim)
        self.target_net = Net(state_dim, action_dim)
        self.learn_step_counter = 0
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)

        memory.clear()
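
To show how the three classes fit together, here is a minimal usage sketch of my own. It is not from the original post: the gym environment CartPole-v1, the classic 4-tuple env.step() API, and all hyperparameter values are assumptions/placeholders.

import gym   # assumed dependency, not imported in the original code

env = gym.make("CartPole-v1")                 # placeholder environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

agent = DQN(state_dim, action_dim, learning_rate=1e-3)
buffer = replay_buffer(capacity=10000)
batch_size, gamma, target_replace_iter = 32, 0.99, 100
epsilon = 0.9   # note: in get_action above, epsilon is the probability of acting greedily

for episode in range(200):
    state = env.reset()                       # classic gym API: reset() returns the observation
    done = False
    while not done:
        action = agent.get_action(state, epsilon, action_dim)
        next_state, reward, done, info = env.step(action)
        buffer.store(state, action, reward, next_state)
        if len(buffer) >= batch_size:
            agent.training(buffer, batch_size, gamma, target_replace_iter)
        state = next_state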
