Code Implementation: Human-level control through deep reinforcement learning



Preface

A DQN implementation, following this video: https://www.youtube.com/watch?v=NP8pXZdU-5U&ab_channel=brthorbrthor


1. Paper

Title: Human-level control through deep reinforcement learning
DOI: 10.1038/nature14236
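
For reference, the TD target built in the training loop below (the line that assigns targets) matches the standard DQN target from the paper; in LaTeX notation, with d_j marking terminal transitions and \theta^- the target-network parameters:

y_j = r_j + \gamma \, (1 - d_j) \, \max_{a'} Q(s'_j, a'; \theta^-)

The online network's Q(s_j, a_j; \theta) is then regressed toward y_j. This particular implementation uses the Huber (smooth L1) loss for that regression, and \theta^- is copied from \theta every TARGET_UPDATE_FREQ steps.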

2. Code

The full script is as follows:

from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random

GAMMA=0.99 #discount factor used when computing the TD target
BATCH_SIZE=32 #number of transitions we sample from the replay buffer when computing gradients
BUFFER_SIZE=50000 #max number of transitions we store in the replay buffer before overwriting old transitions
MIN_REPLAY_SIZE=1000 #how many transitions we want in the replay buffer before we start computing gradients and training
EPSILON_START=1.0 #the starting value of epsilon
EPSILON_END=0.02 #the final value of epsilon
EPSILON_DECAY=10000 #epsilon is linearly annealed from EPSILON_START to EPSILON_END over this many steps
TARGET_UPDATE_FREQ=1000 #the number of steps between setting the target parameters equal to the online parameters

#create our network by defining a class that inherits from PyTorch's nn.Module
class Network(nn.Module):
    #this network is for a discrete action space; a continuous action space would be handled differently
    def __init__(self,env):
        super().__init__()

        in_features=int(np.prod(env.observation_space.shape))
        #use a standard two layer sequential linear network with 64 hidden units
        self.net=nn.Sequential(
            nn.Linear(in_features,64),
            nn.Tanh(),
            nn.Linear(64,env.action_space.n))
    # forward pass: map a batch of observations to one Q-value per action
    def forward(self,x):
        return self.net(x)

    def act(self,obs):
        #turn obs into a pytorch tensor
        obs_t=torch.as_tensor(obs, dtype=torch.float32)
        #compute the Q values for this specific observation
        q_values = self(obs_t.unsqueeze(0)) #unsqueeze(0) adds a fake batch dimension of size one, since every operation in PyTorch expects a batch dimension and we are not using a batched env

        #get the action with the highest q value
        max_q_index=torch.argmax(q_values,dim=1)[0]
        #turn this pytorch tensor into a plain Python integer
        action=max_q_index.detach().item()

        return action #the action index is just a number between 0 and (number of actions - 1), and that's what we return as the action


#create our environment - use the CartPole environment because it is quick to iterate on,
#so we can find out whether or not we have a correct implementation of deep Q-learning
env=gym.make('CartPole-v0')

#create the replay buffer of past transitions
replay_buffer=deque(maxlen=BUFFER_SIZE)
#create the reward buffer where we store the rewards earned by our agent in each episode; we use it to track the improvement of the agent as it trains
rew_buffer=deque([0.0], maxlen=100)
#keep track of the reward for the current episode
episode_reward=0.0

#create our online network and target network
online_net=Network(env)
target_net=Network(env)

#set the target net parameters equal to the online network parameters
target_net.load_state_dict(online_net.state_dict())

optimizer=torch.optim.Adam(online_net.parameters(), lr=5e-4)

#initialize the replay buffer by putting random transitions into it
obs=env.reset()
#loop MIN_REPLAY_SIZE times and select a random action by calling the action space's sample method
for _ in range(MIN_REPLAY_SIZE):
    action=env.action_space.sample()
    new_obs, rew, done, _ = env.step(action)
    #create transition tuple
    transition = (obs,action,rew,done,new_obs)
    #stick that inside our replay buffer
    replay_buffer.append(transition)
    #set the obs as the new_obs
    obs=new_obs

    #if the env needs to be reset --> reset it and get the new obs out of that reset
    if done:
        obs=env.reset()

#Main training loop
#reset env
obs=env.reset()
for step in itertools.count():
    #select the action to take in the env (remember we are using an epsilon-greedy policy, so we compute epsilon for this step; it linearly interpolates between the epsilon start and end values)
    epsilon=np.interp(step, [0,EPSILON_DECAY], [EPSILON_START,EPSILON_END])
    #draw a random sample in [0, 1)
    rnd_sample=random.random()

    if rnd_sample <= epsilon:
        action=env.action_space.sample()
    else:#intelligently select an action using our network
        action=online_net.act(obs)

    new_obs, rew, done, _ = env.step(action)
    transition = (obs, action, rew, done, new_obs)
    replay_buffer.append(transition)
    obs = new_obs

    episode_reward += rew

    if done:
        obs = env.reset()
        #append the episode reward to the reward buffer and reset the episode reward
        rew_buffer.append(episode_reward)
        episode_reward=0.0

        #after the env is solved (average reward of at least 195 over the last 100 episodes for CartPole-v0), watch the agent play
        if len(rew_buffer)>=100:
            if np.mean(rew_buffer)>=195:
                while True:
                    action=online_net.act(obs)
                    obs,_,done,_=env.step(action)
                    env.render()
                    if done:
                        obs=env.reset()

    # start gradient step
    #sample BATCH_SIZE random transitions from the replay buffer we filled earlier
    transitions=random.sample(replay_buffer,BATCH_SIZE)

    #we get each observation from the sampled transitions and put them in a list
    #we call np.asarray because PyTorch is much faster at building a tensor from a numpy array than from a plain Python list
    obses=np.asarray([t[0] for t in transitions])
    actions = np.asarray([t[1] for t in transitions])
    rews = np.asarray([t[2] for t in transitions])
    dones = np.asarray([t[3] for t in transitions])
    new_obses = np.asarray([t[4] for t in transitions])
    #convert all of the arrays above into PyTorch tensors
    obses_t = torch.as_tensor(obses,dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
    dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
    new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)

    #compute targets
    #get the target Q-values for the new observations; this is the first place where we use the target net
    target_q_values=target_net(new_obses_t)
    #for each new observation we have a set of Q-values; we need to collapse target_q_values down to the single highest Q-value per observation
    max_target_q_values=target_q_values.max(dim=1,keepdim=True)[0]
    #Explanation: we have a set of Q-values for each observation, and each observation is essentially
    # the batch dimension while the Q-values live in dimension one. So we take the max along dimension
    # one, keep that dimension around even though there is only one value left in it, and then index [0]
    # because max returns a tuple whose first element holds the highest values and whose second element
    # holds the indices of those values (which is equivalent to argmax).

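    #TD target: reward plus the discounted max target-network Q-value of the next observation;
    #the (1 - dones_t) factor zeroes out the bootstrap term for terminal transitions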
    targets=rews_t + GAMMA * (1 - dones_t) * max_target_q_values

    #compute loss
    q_values=online_net(obses_t)

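    #pick out, for each transition, the Q-value of the action that was actually taken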
    action_q_values=torch.gather(input=q_values, dim=1, index=actions_t)

    loss=nn.functional.smooth_l1_loss(action_q_values, targets)

    #Gradient descent
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    #update the target network
    if step % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(online_net.state_dict())

    #logging
    if step % 1000==0:
        print()
        print('Step',step)
        print('Avg Rew',np.mean(rew_buffer))



3. Environment

pip install torch gym
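
Note: the script above uses the classic Gym API, where env.reset() returns only the observation and env.step() returns four values; this matches gym releases before 0.26. If a recent gym or gymnasium is installed instead, the reset/step/render calls need a small adaptation. A minimal sketch, assuming gym>=0.26 or gymnasium:

import gymnasium as gym  # or: import gym  (for gym>=0.26)

# rendering is requested at construction time instead of calling env.render()
env = gym.make('CartPole-v0', render_mode=None)

# reset() now returns (obs, info) instead of just obs
obs, _ = env.reset()

# step() now returns five values; an episode ends when either flag is True
action = env.action_space.sample()
new_obs, rew, terminated, truncated, _ = env.step(action)
done = terminated or truncated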

4. Experimental Results
