Solving a Simple Maze Navigation Problem with Double DQN

Environment: a 25*25 discrete grid world. The start is the top-left corner and the goal is the bottom-right corner, with 1*1 obstacles scattered at random. The action space has two actions: move right and move down. Each move yields a reward of -1, hitting an obstacle or stepping outside the grid yields -100, and reaching the goal yields +20.
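The environment is provided by a local Map.py that is not included in this post. Purely as an illustration of the interface the agent code below relies on (reset, step, state, action_space.n), here is a minimal sketch reconstructed from the description above; the class body, the obstacle count, and the _ActionSpace helper are assumptions, not the author's actual implementation:

import numpy as np

class _ActionSpace:  # hypothetical helper, only exposes .n like a gym action space
    def __init__(self, n):
        self.n = n  # two actions: 0 = move right, 1 = move down

class Map:  # hypothetical reconstruction of the environment described above
    def __init__(self, size=25, n_obstacles=30):  # obstacle count is an assumption
        self.size = size
        self.action_space = _ActionSpace(2)
        self.obstacles = set()
        while len(self.obstacles) < n_obstacles:
            cell = (np.random.randint(size), np.random.randint(size))
            if cell not in ((0, 0), (size - 1, size - 1)):
                self.obstacles.add(cell)
        self.state = (0, 0)

    def reset(self):
        self.state = (0, 0)  # start at the top-left corner
        return self.state

    def step(self, action):
        r, c = self.state
        r, c = (r, c + 1) if action == 0 else (r + 1, c)  # 0: right, 1: down
        self.state = (r, c)
        if r >= self.size or c >= self.size or (r, c) in self.obstacles:
            return self.state, -100, True, {}  # left the grid or hit an obstacle
        if (r, c) == (self.size - 1, self.size - 1):
            return self.state, 20, True, {}    # reached the goal
        return self.state, -1, False, {}       # ordinary move costs -1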

 

 Note: SARSA, Q-learning, and similar tabular methods are expected to solve this task quickly and reliably. A plain DQN was trained for a long time without the loss converging, presumably because it still had not been trained enough. Double DQN also needed a long training run; the key point is that the epsilon used for exploration must not decay too quickly.
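The only algorithmic difference between the DQN and Double DQN agents below is how the bootstrapped target is formed. The following sketch (illustrative only, not the author's code; both Q-value inputs are assumed to be batched [batch, n_actions] tensors) shows the two targets side by side:

import torch

def dqn_target(reward, gamma, q_target_next):
    # Plain DQN: the target network both selects and evaluates the next action,
    # which tends to overestimate Q-values through the max operator.
    return reward + gamma * q_target_next.max(dim=1)[0]

def double_dqn_target(reward, gamma, q_eval_next, q_target_next):
    # Double DQN: the evaluation network selects the next action,
    # and the target network evaluates that action.
    a_star = q_eval_next.max(dim=1)[1]
    return reward + gamma * q_target_next.gather(1, a_star.unsqueeze(1)).squeeze(1)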

Code:

import torch
from torchviz import make_dot, make_dot_from_trace
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from Map import Map
import matplotlib.pyplot as plt
import math
env = Map()
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['observation', 'action', 'reward',
                'next_observation', 'done'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        self.memory.loc[self.i] = args
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
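# Illustrative usage of the replay buffer (not part of the training loop below):
#   replayer = DQNReplayer(capacity=100)
#   replayer.store((0, 0), 1, -1.0, (0, 1), False)
#   observations, actions, rewards, next_observations, dones = replayer.sample(1)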

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # First fully connected layer: maps a [batch_size, 2] state tensor
        # (the agent's 2-D grid position) to a [batch_size, 32] hidden layer.
        self.fc1 = nn.Linear(2, 32)
        # Initialize the weights from a normal distribution with mean 0 and std 0.1.
        self.fc1.weight.data.normal_(0, 0.1)
        # Output layer: maps [batch_size, 32] to [batch_size, 2], one Q-value per action.
        self.out = nn.Linear(32, 2)
        self.out.weight.data.normal_(0, 0.1)  # initialization

    # Two-layer fully connected network

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)  # ReLU activation after the first linear layer
        actions_value = self.out(x)
        return actions_value  # one Q-value per action
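# Shape check (illustrative): a batch of 4 two-dimensional states yields 4 pairs of Q-values,
#   Net()(torch.zeros(4, 2)).shape  ->  torch.Size([4, 2])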


class DQNAgent:
    def __init__(self, env, gamma=0.95,
             replayer_capacity=10000, batch_size=64):

        self.action_n = env.action_space.n
        self.gamma = gamma
        self.batch_size = batch_size
        self.replayer = DQNReplayer(replayer_capacity)  # experience replay buffer
        self.evaluate_net = Net()
        self.target_net=Net()
        self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.lossfunc=torch.nn.MSELoss()
    def learn(self, observation, action, reward, next_observation, done,count):

        self.replayer.store(observation, action, reward, next_observation, done)  # store the transition

        if count % 50 == 0:
            # Periodically sync the target network with the evaluation network
            self.target_net.load_state_dict(self.evaluate_net.state_dict())

        if count > self.batch_size:
            observations, actions, rewards, next_observations, dones = \
                    self.replayer.sample(self.batch_size)  # sample a minibatch from the replay buffer
            batch_s = torch.FloatTensor(observations)
            batch_a = torch.LongTensor(actions)
            batch_r = torch.FloatTensor(rewards)
            batch_s_ = torch.FloatTensor(next_observations)
            batch_d = torch.FloatTensor(dones.astype(np.float32))

            '''q = self.evaluate_net(batch_s_).gather(1, batch_a)  #
            q_target = self.target_net(batch_s_).detach()

            y = batch_r + self.gamma * q_target.max(1)[0].view(self.batch_size, 1)   # shape (batch, 1)'''
            q_next = self.target_net(batch_s_).detach()   # target-network values for s'
            q_eval = self.evaluate_net(batch_s)           # evaluation-network values for s
            q_target = q_eval.clone().detach()            # the target must not carry gradients
            batch_index = np.arange(self.batch_size)
            # DQN target: r + gamma * max_a Q_target(s', a), written into the slot of the taken action
            q_target[batch_index, batch_a] = batch_r + self.gamma * q_next.max(1)[0] * (1. - batch_d)

            loss = self.lossfunc(q_eval,q_target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        else:
            loss=0
        #if done: # update the target network
            #self.target_net.set_weights(self.evaluate_net.get_weights())

        return loss

    def decide(self, observation, count):  # epsilon-greedy policy
        #epsilon=cal_epsilon(count)
        epsilon=0.01
        if np.random.rand() < epsilon:
            return np.random.randint(self.action_n)
        s=torch.unsqueeze(torch.FloatTensor(observation), 0)
        actions_value = self.evaluate_net.forward(s)
        # Take the argmax over each row of the action values, move the index tensor to the CPU and convert it to numpy
        action = torch.max(actions_value, 1)[1].data.cpu().numpy()
        action = action[0]
        return action

agent=DQNAgent(env)

class DoubleDQNAgent:
    def __init__(self, env, gamma=0.95,
             replayer_capacity=10000, batch_size=64):

        self.action_n = env.action_space.n
        self.gamma = gamma
        self.batch_size = batch_size
        self.replayer = DQNReplayer(replayer_capacity)  # experience replay buffer
        self.evaluate_net = Net()
        self.target_net=Net()
        self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.lossfunc=torch.nn.MSELoss()
    def learn(self, observation, action, reward, next_observation, done,count):

        self.replayer.store(observation, action, reward, next_observation, done)  # store the transition

        if count % 50 == 0:
            # Periodically sync the target network with the evaluation network
            self.target_net.load_state_dict(self.evaluate_net.state_dict())

        if count > self.batch_size:
            observations, actions, rewards, next_observations, dones = \
                    self.replayer.sample(self.batch_size)  # sample a minibatch from the replay buffer
            batch_s = torch.FloatTensor(observations)
            batch_a = torch.LongTensor(actions)
            batch_r = torch.FloatTensor(rewards)
            batch_s_ = torch.FloatTensor(next_observations)
            batch_d = torch.FloatTensor(dones.astype(np.float32))

            '''q = self.evaluate_net(batch_s).gather(1, batch_a)  #
            q_target = self.target_net(batch_s_).detach()

            y = batch_r + self.gamma * q_target.max(1)[0].view(self.batch_size, 1)   # shape (batch, 1)'''

            '''with torch.no_grad:
                q_next=self.target_net(batch_s_).detach()

                action_value=self.evaluate_net(batch_s_)
                action=action_value.max(1)[1].view(self.batch_size,1)
                q_target=q_eval.clone()
                batch_index = np.arange(self.batch_size, dtype=np.int32)
                q_target[batch_index, batch_a]=batch_r+self.gamma * q_next[batch_index,action]'''

            q_eval = self.evaluate_net(batch_s)  # Q(s, .) from the evaluation network
            # Double DQN: the evaluation network selects the next action,
            # and the target network evaluates that action.
            next_actions = self.evaluate_net(batch_s_).detach().max(1)[1]
            next_qs = self.target_net(batch_s_).detach()
            next_max_qs = next_qs[np.arange(self.batch_size), next_actions]

            # Bootstrapped target, masked for terminal transitions
            us = batch_r + self.gamma * next_max_qs * (1. - batch_d)
            targets = q_eval.clone().detach()  # the target must not carry gradients
            targets[np.arange(self.batch_size), batch_a] = us
            loss = self.lossfunc(q_eval, targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        else:
            loss=0
        #if done: # update the target network
            #self.target_net.set_weights(self.evaluate_net.get_weights())

        return loss

    def decide(self, observation, count):  # epsilon-greedy policy
        epsilon=cal_epsilon(count)
        #epsilon=0.01
        if np.random.rand() < epsilon:
            return np.random.randint(self.action_n)
        s=torch.unsqueeze(torch.FloatTensor(observation), 0)
        actions_value = self.evaluate_net.forward(s)
        # Take the argmax over each row of the action values, move the index tensor to the CPU and convert it to numpy
        action = torch.max(actions_value, 1)[1].data.cpu().numpy()
        action = action[0]
        return action

agent2=DoubleDQNAgent(env)

def cal_epsilon(count):
    epsilon_start=0.8
    epsilon_final=0.3
    epsilon_decay=10000
    epsilon=epsilon_final + (epsilon_start - epsilon_final) * math.exp( -1. * count / epsilon_decay)
    #epsilon=0
    return epsilon
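# With these settings epsilon decays slowly from 0.8 towards 0.3:
#   count = 0      -> epsilon = 0.80
#   count = 5000   -> epsilon = 0.3 + 0.5 * exp(-0.5) ≈ 0.60
#   count = 10000  -> epsilon = 0.3 + 0.5 * exp(-1.0) ≈ 0.48
# Since count is the episode index (at most 5000 here), exploration stays high
# throughout training, matching the note above that epsilon must not fall too fast.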

def play_qlearning(env, agent, count, train=False):
    episode_reward = 0
    observation = env.reset()
    actionlist = []
    loss = 0
    while True:
        action = agent.decide(observation, count)
        next_observation, reward, done, _ = env.step(action)
        episode_reward += reward
        actionlist.append(action)
        if train:
            loss = agent.learn(observation, action, reward, next_observation, done, count)

        if done:
            sign = (reward == 20)  # True only if the goal was reached
            final_state = env.state
            return episode_reward, actionlist, sign, loss, final_state
        observation = next_observation
episodes=5000
episode_rewards=[]
book0={}
total_loss=[]
final_state={}
for episode in range(episodes):

    episode_reward,act,flag,L,f_state=play_qlearning(env,agent2,episode+1,train=True)
    if flag==True:
        print('Finished')
        break
    episode_rewards.append(episode_reward)
    total_loss.append(L)
    present_policy= {str(episode):act}
    final_state_present={str(episode):f_state}
    book0.update(present_policy)
    final_state.update(final_state_present)
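matplotlib is imported at the top but no plotting code appears in the post. A minimal sketch (assuming the episode_rewards and total_loss lists filled by the loop above) for inspecting the learning curve could be:

# Sketch: visualize the per-episode return and the training loss collected above.
# Loss entries may be 0 (before the replay buffer warms up) or torch tensors, hence float().
plt.figure()
plt.plot(episode_rewards)
plt.xlabel('episode')
plt.ylabel('episode reward')

plt.figure()
plt.plot([float(l) for l in total_loss])
plt.xlabel('episode')
plt.ylabel('loss')
plt.show()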
