# --- Experience replay buffer (记忆库) ---
# One transition of agent experience: (s, a, r, done, s').
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'])


class Memory:
    """Fixed-capacity cyclic experience-replay buffer.

    Oldest experiences are evicted automatically once `capacity` is
    reached (deque with maxlen).
    """

    def __init__(self, capacity: int):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self) -> int:
        return len(self.buffer)

    def append(self, exp: Experience) -> None:
        """Store one transition, evicting the oldest if full."""
        self.buffer.append(exp)

    def sample(self, batch_size: int):
        """Return `batch_size` transitions, uniformly sampled without
        replacement, as five parallel numpy arrays:
        (states, actions, rewards, dones, new_states)."""
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, new_states = zip(
            *(self.buffer[i] for i in indices))
        return (np.array(states),
                np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8),
                np.array(new_states))
# --- Q-value network (神经网络) ---
class Net(nn.Module):
    """Conv-net mapping a stack of 4 preprocessed frames to Q-values.

    Input is expected as (batch, 4, 50, 70) — the shape produced by
    `preprocess_img` (rows 100:200 and cols 10:150, downsampled by 2),
    stacked 4 deep by the training loop.

    Args:
        n_actions: size of the discrete action space (default 2 keeps
            the original no-argument `Net()` construction working).
    """

    def __init__(self, n_actions: int = 2):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(4, 16, kernel_size=5, stride=2),   # 50x70 -> 23x33
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),  # 23x33 -> 10x15
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(32 * 10 * 15, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions),
        )

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions)."""
        features = self.conv(x)
        return self.fc(features.view(features.size(0), -1))
# --- DQN agent (DQN模型) ---
class DQN(object):
    """Deep Q-Network agent: epsilon-greedy acting + target-network TD learning.

    Fixes over the sketch: attributes bound to `self`, optimizer created,
    gradients zeroed before backward, target Q detached from the graph,
    terminal transitions masked out of the bootstrap target, and the
    step counters actually incremented.
    """

    def __init__(self, n_actions, BATCH_SIZE, MEMORY_CAPYCITY,
                 TARGET_REPLACE_ITER, epsilon=0.9, gamma=0.9, lr=1e-4):
        # Local import: the module header is not visible in this chunk.
        import torch
        self.n_actions = n_actions
        self.BATCH_SIZE = BATCH_SIZE
        # Online network trains; target network is evaluation-only.
        self.eval_net, self.target_net = Net(), Net()
        self.eval_net.train()
        self.target_net.eval()
        self.memory = Memory(MEMORY_CAPYCITY)
        self.memory_counter = 0          # total transitions stored
        self.learn_step_counter = 0      # learn() calls, for target sync
        self.GAMMA = gamma               # discount factor
        self.epsilon = epsilon           # probability of acting greedily
        self.target_replace_iter = TARGET_REPLACE_ITER
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_func = nn.SmoothL1Loss()

    def choose_action(self, x):
        """Epsilon-greedy: exploit with probability epsilon, else random."""
        import torch
        if np.random.uniform() < self.epsilon:
            with torch.no_grad():
                state = torch.as_tensor(np.array(x),
                                        dtype=torch.float32).unsqueeze(0)
                action_value = self.eval_net(state)
                action = int(action_value.argmax(dim=1).item())
        else:
            action = int(np.random.randint(0, self.n_actions))
        return action

    def store_experience(self, exp: Experience):
        """Push one transition into replay memory."""
        self.memory.append(exp)
        self.memory_counter += 1

    def learn(self):
        """One TD-learning step on a replayed minibatch; returns the loss."""
        import torch
        # Periodically sync the frozen target network with the online one.
        if self.learn_step_counter % self.target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        states, actions, rewards, dones, new_states = \
            self.memory.sample(self.BATCH_SIZE)
        states = torch.as_tensor(states, dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.int64)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        dones = torch.as_tensor(dones, dtype=torch.bool)
        new_states = torch.as_tensor(new_states, dtype=torch.float32)

        # Q(s, a) for the actions actually taken.
        q_eval = self.eval_net(states).gather(
            1, actions.unsqueeze(-1)).squeeze(-1)
        # Bootstrap target: no gradient through the target net, and
        # terminal states contribute no future value.
        with torch.no_grad():
            q_next = self.target_net(new_states).max(1)[0]
            q_next[dones] = 0.0
        q_target = rewards + self.GAMMA * q_next

        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping for stability.
        for param in self.eval_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        return loss.item()
# --- Training (训练) ---
def preprocess_img(img):
    """Grayscale an RGB frame, crop the play area, and downsample by 2.

    Averages the channel axis, keeps rows 100:200 and columns 10:150,
    then takes every second pixel in both directions -> a 50x70 array.
    """
    gray = np.mean(img, axis=2)
    cropped = gray[100:200, 10:150]
    return cropped[::2, ::2]
# Training loop: frame-skip of 4, stacking the 4 skipped observations
# as the network input. Fixes over the sketch: `next_input_buf` and
# `total_r` are initialized each step, reward is accumulated across the
# skip, the skip stops early on episode end, and a short stack is padded
# so the state shape stays (4, 50, 70).
for i in range(epochs):
    s = env.reset()
    # Initial state: the first preprocessed frame repeated 4 times.
    first_frame = preprocess_img(s)
    input_buf = [first_frame] * 4
    while True:
        a = dqn.choose_action(input_buf)
        # Repeat the chosen action for 4 frames, accumulating reward
        # and collecting the new observations.
        next_input_buf, total_r, done = [], 0.0, False
        for _ in range(4):
            s_, r, done, info = env.step(a)
            total_r += r
            next_input_buf.append(preprocess_img(s_))
            if done:
                break
        # Pad the stack if the episode ended mid-skip.
        while len(next_input_buf) < 4:
            next_input_buf.append(next_input_buf[-1])
        exp = Experience(input_buf, a, total_r, done, next_input_buf)
        dqn.store_experience(exp)
        # Start learning only once the replay buffer has been filled.
        if dqn.memory_counter > MEMORY_CAPYCITY:
            dqn.learn()
        if done:
            break
        input_buf = next_input_buf