待完善。。。
详解:
1.在收集一定数量的experience后, 随机提取batch size个样本进行Q-learning
def learn(self, model, target_model, memory, gamma, batch_size):
samples = random.sample(memory, batch_size) # shape_samples = (64, 5)
2.将训练需要的元素(state, action,rewards,next_state,dones)按照字段分别提出
states, actions, rewards, next_states, dones = map(np.array, zip(*samples)) #shape_next_states = (64, 4, 84, 84)
print('shape_states =', np.shape(states))
print('shape_actions =', np.shape(actions))
print('shape_next_states =', np.shape(next_states))
print('shape_dones =', np.shape(dones))
#shape_states = (64, 4, 84, 84)
#shape_actions = (64,)
#shape_next_states = (64, 4, 84, 84)
#shape_dones = (64,)
3.
next_Qs = target_model.forward(torch.from_numpy(next_states)) #shape_next_Qs = torch.Size([64, 9])
4.
next_Q = np.amax(next_Qs.detach().numpy(), axis =1) # shape_next_Q = (64,)
5.
print('dones =', dones)
print('np.invert(dones) =', np.invert(dones).astype(np.float))
#dones = [False False...False False False]
#np.invert(dones) = [1. 1. ... 1. 1. 1.]
targets = rewards + np.invert(dones).astype(np.float)*gamma*next_Q
#shape_targets = (64,)
整段代码
def learn(self, model, target_model, memory, gamma, batch_size):
    """Sample a minibatch from replay memory and compute Q-learning TD targets.

    Parameters
    ----------
    model : online network — currently unused here; kept for interface
        compatibility (TODO: the gradient-update step is still missing,
        as the original note "待完善" says).
    target_model : network whose ``forward`` returns Q(s', ·) estimates
        as a torch tensor of shape (batch, n_actions).
    memory : sequence of (state, action, reward, next_state, done) tuples;
        states are numpy arrays (per the author's run, (4, 84, 84) — not
        enforced here).
    gamma : float discount factor.
    batch_size : number of transitions to sample uniformly.

    Returns
    -------
    numpy.ndarray of shape (batch_size,): TD targets
    ``r + gamma * max_a Q(s', a)``, with the bootstrap term zeroed on
    terminal transitions.
    """
    # Uniformly sample a batch of stored transitions.
    samples = random.sample(memory, batch_size)
    # Transpose the batch: one stacked array per field.
    states, actions, rewards, next_states, dones = map(np.array, zip(*samples))
    # Target network evaluates the successor states.
    next_Qs = target_model.forward(torch.from_numpy(next_states))
    # Greedy bootstrap value per sample: max over the action dimension.
    next_Q = np.amax(next_Qs.detach().numpy(), axis=1)
    # np.float was removed in NumPy 1.24 (AttributeError); np.float64 is
    # the equivalent concrete dtype. ~dones masks the bootstrap term out
    # of terminal transitions.
    targets = rewards + np.invert(dones).astype(np.float64) * gamma * next_Q
    return targets