# --- Experience replay buffer (记忆库) ---
# One transition of agent experience: (s, a, r, done, s').
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'])


class Memory:
    """Fixed-capacity cyclic experience-replay buffer.

    Oldest experiences are evicted automatically once `capacity` is
    reached (deque with maxlen).
    """

    def __init__(self, capacity: int):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self) -> int:
        return len(self.buffer)

    def append(self, exp: Experience) -> None:
        """Store one transition, evicting the oldest if full."""
        self.buffer.append(exp)

    def sample(self, batch_size: int):
        """Return `batch_size` transitions, uniformly sampled without
        replacement, as five parallel numpy arrays:
        (states, actions, rewards, dones, new_states)."""
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, new_states = zip(
            *(self.buffer[i] for i in indices))
        return (np.array(states),
                np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8),
                np.array(new_states))
# --- Q-value network (神经网络) ---
class Net(nn.Module):
    """Conv-net mapping a stack of 4 preprocessed frames to Q-values.

    Input is expected as (batch, 4, 50, 70) — the shape produced by
    `preprocess_img` (rows 100:200 and cols 10:150, downsampled by 2),
    stacked 4 deep by the training loop.

    Args:
        n_actions: size of the discrete action space (default 2 keeps
            the original no-argument `Net()` construction working).
    """

    def __init__(self, n_actions: int = 2):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(4, 16, kernel_size=5, stride=2),   # 50x70 -> 23x33
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),  # 23x33 -> 10x15
            nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(32 * 10 * 15, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions),
        )

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions)."""
        features = self.conv(x)
        return self.fc(features.view(features.size(0), -1))
# --- DQN agent (DQN模型) ---
class DQN(object):
    """Deep Q-Network agent: epsilon-greedy acting + target-network TD learning.

    Fixes over the sketch: attributes bound to `self`, optimizer created,
    gradients zeroed before backward, target Q detached from the graph,
    terminal transitions masked out of the bootstrap target, and the
    step counters actually incremented.
    """

    def __init__(self, n_actions, BATCH_SIZE, MEMORY_CAPYCITY,
                 TARGET_REPLACE_ITER, epsilon=0.9, gamma=0.9, lr=1e-4):
        # Local import: the module header is not visible in this chunk.
        import torch
        self.n_actions = n_actions
        self.BATCH_SIZE = BATCH_SIZE
        # Online network trains; target network is evaluation-only.
        self.eval_net, self.target_net = Net(), Net()
        self.eval_net.train()
        self.target_net.eval()
        self.memory = Memory(MEMORY_CAPYCITY)
        self.memory_counter = 0          # total transitions stored
        self.learn_step_counter = 0      # learn() calls, for target sync
        self.GAMMA = gamma               # discount factor
        self.epsilon = epsilon           # probability of acting greedily
        self.target_replace_iter = TARGET_REPLACE_ITER
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_func = nn.SmoothL1Loss()

    def choose_action(self, x):
        """Epsilon-greedy: exploit with probability epsilon, else random."""
        import torch
        if np.random.uniform() < self.epsilon:
            with torch.no_grad():
                state = torch.as_tensor(np.array(x),
                                        dtype=torch.float32).unsqueeze(0)
                action_value = self.eval_net(state)
                action = int(action_value.argmax(dim=1).item())
        else:
            action = int(np.random.randint(0, self.n_actions))
        return action

    def store_experience(self, exp: Experience):
        """Push one transition into replay memory."""
        self.memory.append(exp)
        self.memory_counter += 1

    def learn(self):
        """One TD-learning step on a replayed minibatch; returns the loss."""
        import torch
        # Periodically sync the frozen target network with the online one.
        if self.learn_step_counter % self.target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        states, actions, rewards, dones, new_states = \
            self.memory.sample(self.BATCH_SIZE)
        states = torch.as_tensor(states, dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.int64)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        dones = torch.as_tensor(dones, dtype=torch.bool)
        new_states = torch.as_tensor(new_states, dtype=torch.float32)

        # Q(s, a) for the actions actually taken.
        q_eval = self.eval_net(states).gather(
            1, actions.unsqueeze(-1)).squeeze(-1)
        # Bootstrap target: no gradient through the target net, and
        # terminal states contribute no future value.
        with torch.no_grad():
            q_next = self.target_net(new_states).max(1)[0]
            q_next[dones] = 0.0
        q_target = rewards + self.GAMMA * q_next

        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping for stability.
        for param in self.eval_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        return loss.item()
# --- Training (训练) ---
def preprocess_img(img):
    """Grayscale an RGB frame, crop the play area, and downsample by 2.

    Averages the channel axis, keeps rows 100:200 and columns 10:150,
    then takes every second pixel in both directions -> a 50x70 array.
    """
    gray = np.mean(img, axis=2)
    cropped = gray[100:200, 10:150]
    return cropped[::2, ::2]
# Training loop: frame-skip of 4, stacking the 4 skipped observations
# as the network input. Fixes over the sketch: `next_input_buf` and
# `total_r` are initialized each step, reward is accumulated across the
# skip, the skip stops early on episode end, and a short stack is padded
# so the state shape stays (4, 50, 70).
for i in range(epochs):
    s = env.reset()
    # Initial state: the first preprocessed frame repeated 4 times.
    first_frame = preprocess_img(s)
    input_buf = [first_frame] * 4
    while True:
        a = dqn.choose_action(input_buf)
        # Repeat the chosen action for 4 frames, accumulating reward
        # and collecting the new observations.
        next_input_buf, total_r, done = [], 0.0, False
        for _ in range(4):
            s_, r, done, info = env.step(a)
            total_r += r
            next_input_buf.append(preprocess_img(s_))
            if done:
                break
        # Pad the stack if the episode ended mid-skip.
        while len(next_input_buf) < 4:
            next_input_buf.append(next_input_buf[-1])
        exp = Experience(input_buf, a, total_r, done, next_input_buf)
        dqn.store_experience(exp)
        # Start learning only once the replay buffer has been filled.
        if dqn.memory_counter > MEMORY_CAPYCITY:
            dqn.learn()
        if done:
            break
        input_buf = next_input_buf