Reinforcement Learning: GPU-Accelerated DQN Playing CartPole (Code)
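The complete script is below. It trains a DQN agent on gym's CartPole-v0 using an experience replay buffer, a periodically synced target network, and a shaped reward built from the cart position and pole angle. The environment is unwrapped so episodes are not cut off by the 200-step time limit, and the four numbered comments mark where data is moved onto the GPU: the networks, the chosen action, the state, and the reward.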

import random
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
import matplotlib.pyplot as plt
# Use the GPU if it is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
batch_size = 32
learning_rate = 0.01
epsilon = 0.9
gamma = 0.9
target_replace_iter = 100
memory_size = 2000
env = gym.make('CartPole-v0')
env = env.unwrapped
action_size = env.action_space.n
state_size = env.observation_space.shape[0]
env_a_shape = 0 if isinstance(env.action_space.sample(), int) else env.action_space.sample().shape
# Fields of each element stored in the replay buffer
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))


class ReplayBuffer:
    def __init__(self):
        self.capacity = memory_size
        self.memory = []
        self.position = 0
        self.counter = 0

    def push(self, *args):
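        # Fill the buffer up to capacity, then overwrite the oldest transitions (circular buffer)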
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity
        self.counter += 1

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class Net(nn.Module):
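    # A small MLP: state -> 50 hidden units (ReLU) -> one Q-value per action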
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(state_size, 50)
        self.fc1.weight.data.normal_(0, 0.1)  # initialization
        self.out = nn.Linear(50, action_size)
        self.out.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        action_value = self.out(x)
        return action_value


class DQN:
    def __init__(self):
        super(DQN, self).__init__()
        self.eval_net, self.target_net = Net().to(device), Net().to(device)  # 1. GPU acceleration: move both networks onto the device
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
        self.loss_func = nn.MSELoss()
        self.learn_step_counter = 0

    def choose_action(self, x):
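        # Epsilon-greedy: act greedily with probability epsilon, otherwise take a random action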
        if np.random.uniform() < epsilon:
            with torch.no_grad():  # no gradient tracking is needed when only selecting an action
                action_value = self.eval_net(x)
            action = action_value.max(1)[1].unsqueeze(0)
            action = action[0] if env_a_shape == 0 else action.reshape(env_a_shape)
        else:
            action = np.random.randint(0, action_size)
            action = torch.LongTensor([action]).to(device)  # 2. Put the action on the device (LongTensor, so it can be used as a gather index)
        return action

    def learn_dqn(self):
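        # Copy eval_net's weights into target_net every target_replace_iter learning steps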
        if self.learn_step_counter % target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # Sample a minibatch of transitions from the replay buffer
        transitions = memory.sample(batch_size=batch_size)
        batch = Transition(*zip(*transitions))
        b_s = torch.cat(batch.state)
        b_a = torch.cat(batch.action).unsqueeze(1)
        b_r = torch.cat(batch.reward).unsqueeze(1)
        b_s_ = torch.cat(batch.next_state)
        # DQN update: Q(s, a) from eval_net vs. the TD target r + gamma * max_a' Q_target(s', a')
        q_eval = self.eval_net(b_s).gather(1, b_a)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + gamma * q_next.max(1)[0].view(batch_size, 1)
        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

if __name__ == "__main__":
    dqn = DQN()
    print('DQN is collecting experience...')
    result_dqn = []
    memory = ReplayBuffer()
    for i_episode in range(400):
        s = env.reset()
        # Convert the state to a tensor and copy it to the GPU
        s = torch.Tensor([s]).to(device)  # 3. Move the state onto the device
        ep_r = 0
        while True:
            # env.render()
            a = dqn.choose_action(s)
            s_, reward, done, info = env.step(a.data.item())
            # Reward shaping: favour a centred cart and an upright pole instead of the environment's constant +1 reward
            x, x_dot, theta, theta_dot = s_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2
            # Convert the shaped reward and next state to tensors on the device  # 4. Put the reward on the GPU
            r = torch.Tensor([r]).to(device)
            s_ = torch.Tensor([s_]).to(device)
            # Store the transition in the replay buffer
            memory.push(s, a, r, s_)
            # dqn.store_transition(s, a, r, s_)
            ep_r += reward
            # print(memory.counter)
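            # Start learning only once the replay buffer has been filled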
            if memory.counter > memory_size:
                dqn.learn_dqn()
                if done:
                    result_dqn.append(ep_r)
                    print('Episode: ', i_episode, '|ep_r: ', ep_r)
            if done:
                break
            s = s_

    plt.plot(result_dqn)
    plt.show()
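
Note that the script relies on the classic gym API (roughly gym <= 0.25), where env.reset() returns only the observation and env.step() returns four values. For reference, here is a minimal sketch (not part of the original script) of how the interaction loop looks under the newer gymnasium package; the gymnasium import, the env2 name, CartPole-v1, and the random policy are all just for illustration:

# Sketch only: the gymnasium API for comparison (assumes `pip install gymnasium`)
import gymnasium

env2 = gymnasium.make('CartPole-v1')  # CartPole-v1 raises the episode cap from 200 to 500 steps
obs, info = env2.reset()              # reset() now returns (observation, info)
done = False
while not done:
    action = env2.action_space.sample()  # random actions, just to show the API
    obs, reward, terminated, truncated, info = env2.step(action)
    done = terminated or truncated    # step() now returns five values
env2.close()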
