代码实现 Human-level control through deep reinforcement learning
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
前言
使用 DQN 实现。参考视频(brthor):https://www.youtube.com/watch?v=NP8pXZdU-5U&ab_channel=brthor
提示:以下是本篇文章正文内容,下面案例可供参考
一、论文名称?
Title:Human-level control through deep reinforcement learning
doi:10.1038/nature14236
二、代码
代码如下(示例):
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random
# Hyperparameters of the DQN agent.

# Discount rate used when computing the TD target.
GAMMA = 0.99
# Number of transitions sampled from the replay buffer per gradient step.
BATCH_SIZE = 32
# Maximum number of transitions kept in the replay buffer before the oldest
# ones are overwritten.
BUFFER_SIZE = 50_000
# Number of transitions collected before gradient steps start.
MIN_REPLAY_SIZE = 1_000
# Epsilon is annealed linearly from EPSILON_START to EPSILON_END over
# EPSILON_DECAY steps.
EPSILON_START = 1.0
EPSILON_END = 0.02
EPSILON_DECAY = 10_000
# Number of steps between copies of the online parameters into the target
# network.
TARGET_UPDATE_FREQ = 1_000
class Network(nn.Module):
    """Two-layer fully connected Q-network for a discrete action space.

    Maps an observation to one Q-value per action. Continuous action spaces
    need a different design and are not handled here.

    Args:
        env: Environment-like object. Only ``env.observation_space.shape``
            and ``env.action_space.n`` are read.
    """

    def __init__(self, env):
        super().__init__()
        # Number of scalar entries in one observation.
        in_features = int(np.prod(env.observation_space.shape))
        # Standard two-layer linear network with 64 hidden units.
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.Tanh(),
            nn.Linear(64, env.action_space.n),
        )

    def forward(self, x):
        """Return the Q-values the network predicts for ``x``.

        Args:
            x: Float tensor whose last dimension matches the input size of
                the first linear layer.

        Returns:
            Tensor with one Q-value per action along the last dimension.
        """
        return self.net(x)

    def act(self, obs):
        """Return the greedy action (highest Q-value) for one observation.

        Args:
            obs: A single observation without a batch dimension; anything
                ``torch.as_tensor`` accepts.

        Returns:
            The index of the action with the largest Q-value, as a Python int.
        """
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        # Action selection never needs gradients, so skip building the
        # autograd graph for this forward pass.
        with torch.no_grad():
            # The network expects a batch dimension; add a fake batch of
            # size one.
            q_values = self(obs_t.unsqueeze(0))
        max_q_index = torch.argmax(q_values, dim=1)[0]
        # Convert the zero-dimensional tensor into a plain integer.
        return max_q_index.item()
# Environment: CartPole is used because it is quick to iterate on.
env = gym.make('CartPole-v0')

# Replay buffer of (obs, action, rew, done, new_obs) transitions; once
# BUFFER_SIZE entries are stored, the oldest ones are dropped.
replay_buffer = deque(maxlen=BUFFER_SIZE)

# Total reward of each finished episode (at most the last 100 entries), used
# to track how the agent improves during training. Starts with a single 0.0.
rew_buffer = deque([0.0], maxlen=100)

# Running reward total of the episode currently in progress.
episode_reward = 0.0

# Online and target networks; the target starts as an exact copy of the
# online network.
online_net = Network(env)
target_net = Network(env)
target_net.load_state_dict(online_net.state_dict())

optimizer = torch.optim.Adam(online_net.parameters(), lr=5e-4)

# Warm-up: fill the replay buffer with MIN_REPLAY_SIZE transitions produced
# by uniformly random actions.
obs = env.reset()
for _ in range(MIN_REPLAY_SIZE):
    action = env.action_space.sample()
    new_obs, rew, done, _ = env.step(action)
    transition = (obs, action, rew, done, new_obs)
    replay_buffer.append(transition)
    # Continue from the new observation, or start a fresh episode when the
    # current one has ended.
    obs = env.reset() if done else new_obs
# Main training loop. It has no exit condition: it trains until the agent is
# considered solved and then renders the greedy policy forever.
# NOTE(review): written against an API where env.reset() returns only the
# observation and env.step() returns four values - confirm the installed gym
# version matches.
obs = env.reset()

for step in itertools.count():
    # Epsilon-greedy action selection. Epsilon is interpolated linearly from
    # EPSILON_START to EPSILON_END over the first EPSILON_DECAY steps;
    # np.interp holds the end value for larger steps.
    epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])
    rnd_sample = random.random()
    if rnd_sample <= epsilon:
        action = env.action_space.sample()
    else:
        # Let the online network pick the greedy action.
        action = online_net.act(obs)

    new_obs, rew, done, _ = env.step(action)
    transition = (obs, action, rew, done, new_obs)
    replay_buffer.append(transition)
    obs = new_obs
    episode_reward += rew

    if done:
        obs = env.reset()
        # Record the finished episode's total reward and start a new total.
        rew_buffer.append(episode_reward)
        episode_reward = 0.0

    # After the task is solved (mean of the last 100 recorded episode rewards
    # is at least 195), stop training and watch the agent play.
    if len(rew_buffer) >= 100:
        if np.mean(rew_buffer) >= 195:
            while True:
                action = online_net.act(obs)
                obs, _, done, _ = env.step(action)
                env.render()
                if done:
                    # Keep the observation returned by reset(); otherwise the
                    # next action would be chosen from the stale terminal
                    # observation of the finished episode.
                    obs = env.reset()

    # Start of the gradient step: sample BATCH_SIZE random transitions from
    # the replay buffer.
    transitions = random.sample(replay_buffer, BATCH_SIZE)

    # Gather each field into one NumPy array first; building a tensor from a
    # NumPy array is much faster than from a Python list.
    obses = np.asarray([t[0] for t in transitions])
    actions = np.asarray([t[1] for t in transitions])
    rews = np.asarray([t[2] for t in transitions])
    dones = np.asarray([t[3] for t in transitions])
    new_obses = np.asarray([t[4] for t in transitions])

    # Convert to tensors. The per-transition scalars get a trailing dimension
    # so they line up with the keepdim=True / gather results below.
    obses_t = torch.as_tensor(obses, dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rews_t = torch.as_tensor(rews, dtype=torch.float32).unsqueeze(-1)
    dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
    new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)

    # Compute the TD targets. They are constants for the loss, so no autograd
    # graph is built through the target network; this also stops .grad from
    # accumulating on target_net, which the optimizer never zeroes.
    with torch.no_grad():
        target_q_values = target_net(new_obses_t)
        # max() over dim 1 (the action dimension) returns (values, indices);
        # keep only the values, one per observation.
        max_target_q_values = target_q_values.max(dim=1, keepdim=True)[0]
        # (1 - dones_t) removes the bootstrap term for transitions that ended
        # an episode.
        targets = rews_t + GAMMA * (1 - dones_t) * max_target_q_values

    # Compute the loss on the Q-value of the action actually taken.
    q_values = online_net(obses_t)
    action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)
    loss = nn.functional.smooth_l1_loss(action_q_values, targets)

    # Gradient descent on the online network.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Periodically copy the online parameters into the target network.
    if step % TARGET_UPDATE_FREQ == 0:
        target_net.load_state_dict(online_net.state_dict())

    # Logging.
    if step % 1000 == 0:
        print()
        print('Step', step)
        print('Avg Rew', np.mean(rew_buffer))
三、环境
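pip install torch gym
注意:上面的代码使用旧版 Gym API(env.reset() 只返回 obs,env.step() 返回 4 个值),需要安装 0.26 之前的 gym,例如:pip install "gym<0.26"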
四、实验结果