DQN builds on Q-learning by combining it with a neural network: instead of storing Q-values in a Q-table, it approximates them with a network, which greatly reduces memory usage and also saves time.
On top of Q-learning, DQN adds two new tricks: experience replay and a fixed (target) Q-network. A minimal replay-buffer sketch follows the hyperparameters below.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import parl

LEARN_FREQ = 5            # run a training step every 5 environment steps
MEMORY_SIZE = 20000       # replay buffer capacity
MEMORY_WARMUP_SIZE = 200  # number of transitions to collect before training starts
BATCH_SIZE = 32           # minibatch size sampled from the replay buffer
LEARNING_RATE = 0.001     # learning rate of the optimizer
GAMMA = 0.99              # discount factor for future rewards
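The replay buffer itself does not appear in this excerpt. Below is a minimal sketch, assuming a plain deque-based FIFO buffer; the ReplayMemory name and its append/sample interface are illustrative choices here (PARL also ships its own replay utilities):

import random
import collections

class ReplayMemory:
    def __init__(self, max_size):
        # FIFO buffer: once full, the oldest transitions are discarded
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        # exp = (obs, act, reward, next_obs, done)
        self.buffer.append(exp)

    def sample(self, batch_size):
        # uniformly sample a minibatch of stored transitions
        mini_batch = random.sample(self.buffer, batch_size)
        obs, act, reward, next_obs, done = zip(*mini_batch)
        return (np.array(obs, dtype='float32'), np.array(act),
                np.array(reward, dtype='float32'),
                np.array(next_obs, dtype='float32'),
                np.array(done, dtype='float32'))

    def __len__(self):
        return len(self.buffer)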
class Model(parl.Model):
    # 3-layer MLP that maps an observation to one Q-value per action
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, act_dim)

    def forward(self, x):
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        Q = self.fc3(h2)
        return Q
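This Model is normally wrapped by PARL's built-in parl.algorithms.DQN, which keeps both the learned network and a periodically synced copy of it (the "fixed Q network") and computes the TD loss. The following is only a hedged sketch of the target that loss regresses toward, assuming a standard mean-squared TD error; dqn_loss, model and target_model are hypothetical names, not PARL's actual code:

def dqn_loss(model, target_model, act_dim, obs, act, reward, next_obs, terminal):
    # Bellman target: r + GAMMA * (1 - done) * max_a' Q_target(s', a'),
    # computed with the frozen target network so the target stays fixed between syncs
    with paddle.no_grad():
        max_next_q = paddle.max(target_model(next_obs), axis=1, keepdim=True)
        target = reward + GAMMA * (1.0 - terminal) * max_next_q
    # Q-value of the action that was actually taken
    act_onehot = F.one_hot(paddle.squeeze(act, axis=-1).astype('int64'), num_classes=act_dim)
    pred_q = paddle.sum(model(obs) * act_onehot, axis=1, keepdim=True)
    return F.mse_loss(pred_q, target)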
class Agent(parl.Agent):
    def __init__(self, algorithm, act_dim, e_greed=None, e_greed_decrement=None):
        super().__init__(algorithm)
        self.e_greed = e_greed
        self.e_greed_decrement = e_greed_decrement
        self.act_dim = act_dim
        self.global_steps = 0
        self.update_target_steps = 200  # sync the target network every 200 learn calls

    def sample(self, obs):
        # epsilon-greedy exploration: random action with probability e_greed
        sample1 = np.random.random()
        if sample1 > self.e_greed:
            act = self.predict(obs)
        else:
            act = np.random.randint(self.act_dim)
        self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)  # decay exploration
        return act

    def predict(self, obs):
        obs = paddle.to_tensor(obs, dtype='float32')
        predQ = self.alg.predict(obs)
        act = paddle.argmax(predQ).numpy()[0]  # greedy action
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        # periodically copy the learned weights into the fixed target network
        if self.global_steps % self.update_target_steps == 0:
            self.alg.sync_target()
        self.global_steps += 1

        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        terminal = np.expand_dims(terminal, axis=-1)

        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')
        next_obs = paddle.to_tensor(next_obs, dtype='float32')
        terminal = paddle.to_tensor(terminal, dtype='float32')
        # delegate the TD update to the algorithm (standard PARL pattern)
        loss = self.alg.learn(obs, act, reward, next_obs, terminal)
        return loss
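Finally, a hedged sketch of how these pieces are typically assembled and trained, assuming the classic gym CartPole-v0 environment and the ReplayMemory sketch above; the run_episode helper and this wiring are illustrative and do not appear in the original excerpt:

import gym

env = gym.make('CartPole-v0')
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n

model = Model(obs_dim, act_dim)
algorithm = parl.algorithms.DQN(model, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(algorithm, act_dim, e_greed=0.1, e_greed_decrement=1e-6)
rpm = ReplayMemory(MEMORY_SIZE)

def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, step = 0, 0
    while True:
        step += 1
        act = agent.sample(obs)                       # epsilon-greedy action
        next_obs, reward, done, _ = env.step(act)
        rpm.append((obs, act, reward, next_obs, done))
        # train only after the warm-up phase, and only every LEARN_FREQ steps
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            batch = rpm.sample(BATCH_SIZE)
            agent.learn(*batch)
        total_reward += reward
        obs = next_obs
        if done:
            return total_reward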