Environment: a 25×25 discrete grid world. The start is the top-left corner and the goal is the bottom-right corner, with 1×1 obstacles scattered at random. The action space has 2 actions: move right and move down. Each move gives a reward of -1, hitting an obstacle or moving off the grid gives -100, and reaching the goal gives +20.
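The Map class imported in the listing below is not shown. The following is a rough, self-contained sketch of an environment consistent with the description above; the interface (reset(), step(), state, action_space.n) and the obstacle count are assumptions inferred from how the agent code uses it, and whether hitting an obstacle or leaving the grid ends the episode is also an assumption.

import numpy as np

class ActionSpace:
    def __init__(self, n):
        self.n = n

class Map:
    # Hypothetical sketch of the grid-world environment; the real Map class may differ.
    def __init__(self, size=25, n_obstacles=60):   # n_obstacles is an assumed value
        self.size = size
        self.action_space = ActionSpace(2)          # 0: move right, 1: move down
        self.obstacles = set()
        while len(self.obstacles) < n_obstacles:    # random 1x1 obstacles
            cell = (np.random.randint(size), np.random.randint(size))
            if cell not in [(0, 0), (size - 1, size - 1)]:
                self.obstacles.add(cell)
        self.state = (0, 0)

    def reset(self):
        self.state = (0, 0)                         # start at the top-left corner
        return np.array(self.state, dtype=np.float32)

    def step(self, action):
        r, c = self.state
        r, c = (r, c + 1) if action == 0 else (r + 1, c)
        self.state = (r, c)
        obs = np.array(self.state, dtype=np.float32)
        if r >= self.size or c >= self.size:        # moved off the grid
            return obs, -100, True, {}
        if (r, c) in self.obstacles:                # hit an obstacle (assumed terminal)
            return obs, -100, True, {}
        if (r, c) == (self.size - 1, self.size - 1):  # reached the goal
            return obs, 20, True, {}
        return obs, -1, False, {}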
Note: tabular methods such as SARSA and Q-learning are expected to solve this task quickly and reliably. With DQN, the loss did not converge even after a long training run, presumably because training was still insufficient. Double DQN also needed a long training time, and the key point is that the epsilon used for exploration must not decay too fast.
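For a rough sense of how slow "not too fast" is here, the schedule used in the code below (start 0.8, final 0.3, decay constant 10000, with count being the episode index) keeps epsilon high for thousands of episodes. A small sketch evaluating the same formula:

import math

def epsilon_at(count, start=0.8, final=0.3, decay=10000):
    # same exponential schedule as cal_epsilon in the listing below
    return final + (start - final) * math.exp(-count / decay)

for count in (1, 1000, 5000, 20000):
    print(count, round(epsilon_at(count), 3))
# roughly: 1 -> 0.8, 1000 -> 0.752, 5000 -> 0.603, 20000 -> 0.368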
Code:
import torch
from torchviz import make_dot, make_dot_from_trace
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from Map import Map
import matplotlib.pyplot as plt
import math
env = Map()
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                                   columns=['observation', 'action', 'reward',
                                            'next_observation', 'done'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        # Write one transition at the current slot, overwriting old data once full.
        self.memory.loc[self.i] = args
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        # Uniformly sample `size` stored transitions (with replacement).
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Fully connected layer, y = xA^T + b: input is a [batch_size, 2] tensor
        # (the 2-D grid coordinate), output is a [batch_size, 32] tensor.
        self.fc1 = nn.Linear(2, 32)
        # Initialize the weights from a normal distribution with mean 0, std 0.1.
        self.fc1.weight.data.normal_(0, 0.1)
        # Output layer: input [batch_size, 32], output [batch_size, 2] (one value per action).
        self.out = nn.Linear(32, 2)
        self.out.weight.data.normal_(0, 0.1)  # initialization

    # Two-layer fully connected network.
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)  # ReLU activation after the first layer
        actions_value = self.out(x)
        return actions_value  # action values
class DQNAgent:
    def __init__(self, env, gamma=0.95,
                 replayer_capacity=10000, batch_size=64):
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.batch_size = batch_size
        self.replayer = DQNReplayer(replayer_capacity)  # experience replay buffer
        self.evaluate_net = Net()
        self.target_net = Net()
        self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.lossfunc = torch.nn.MSELoss()

    def learn(self, observation, action, reward, next_observation, done, count):
        self.replayer.store(observation, action, reward, next_observation, done)  # store experience
        if count % 50 == 0:
            # Periodically copy the evaluation network into the target network.
            self.target_net.load_state_dict(self.evaluate_net.state_dict())
        if count > self.batch_size:
            # Start learning only after enough episodes have filled the replay buffer.
            observations, actions, rewards, next_observations, dones = \
                self.replayer.sample(self.batch_size)  # experience replay
            batch_s = torch.FloatTensor(observations)
            batch_a = torch.LongTensor(actions)
            batch_r = torch.FloatTensor(rewards)
            batch_s_ = torch.FloatTensor(next_observations)
            batch_d = torch.FloatTensor(dones.astype(np.float32))
            # (An equivalent formulation would gather the Q-values of the taken
            #  actions and regress them onto batch_r + gamma * max target Q.)
            q_next = self.target_net(batch_s_).detach()
            q_eval = self.evaluate_net(batch_s)
            # Build the TD target: only the value of the taken action is replaced,
            # and bootstrapping is masked on terminal transitions.
            q_target = q_eval.clone().detach()
            batch_index = np.arange(self.batch_size)
            q_target[batch_index, batch_a] = batch_r + self.gamma * q_next.max(1)[0] * (1. - batch_d)
            loss = self.lossfunc(q_eval, q_target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        else:
            loss = 0
        return loss

    def decide(self, observation, count):  # epsilon-greedy policy
        # epsilon = cal_epsilon(count)
        epsilon = 0.01
        if np.random.rand() < epsilon:
            return np.random.randint(self.action_n)
        s = torch.unsqueeze(torch.FloatTensor(observation), 0)
        actions_value = self.evaluate_net.forward(s)
        # Take the index of the largest action value in each row, move it to the CPU
        # and convert it to numpy.
        action = torch.max(actions_value, 1)[1].data.cpu().numpy()
        action = action[0]
        return action

agent = DQNAgent(env)
class DoubleDQNAgent:
    def __init__(self, env, gamma=0.95,
                 replayer_capacity=10000, batch_size=64):
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.batch_size = batch_size
        self.replayer = DQNReplayer(replayer_capacity)  # experience replay buffer
        self.evaluate_net = Net()
        self.target_net = Net()
        self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.lossfunc = torch.nn.MSELoss()

    def learn(self, observation, action, reward, next_observation, done, count):
        self.replayer.store(observation, action, reward, next_observation, done)  # store experience
        if count % 50 == 0:
            # Periodically copy the evaluation network into the target network.
            self.target_net.load_state_dict(self.evaluate_net.state_dict())
        if count > self.batch_size:
            observations, actions, rewards, next_observations, dones = \
                self.replayer.sample(self.batch_size)  # experience replay
            batch_s = torch.FloatTensor(observations)
            batch_a = torch.LongTensor(actions)
            batch_r = torch.FloatTensor(rewards)
            batch_s_ = torch.FloatTensor(next_observations)
            batch_d = torch.FloatTensor(dones.astype(np.float32))
            batch_index = np.arange(self.batch_size)
            q_eval = self.evaluate_net(batch_s)
            # Double DQN: the evaluation network selects the next action,
            # the target network evaluates it.
            next_eval_qs = self.evaluate_net(batch_s_).detach()
            next_actions = next_eval_qs.max(1)[1]
            next_qs = self.target_net(batch_s_).detach()
            next_max_qs = next_qs[batch_index, next_actions]
            us = batch_r + self.gamma * next_max_qs * (1. - batch_d)
            # Build the regression target: only the taken action's value is replaced.
            targets = q_eval.clone().detach()
            targets[batch_index, batch_a] = us
            loss = self.lossfunc(q_eval, targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        else:
            loss = 0
        return loss

    def decide(self, observation, count):  # epsilon-greedy policy
        epsilon = cal_epsilon(count)
        # epsilon = 0.01
        if np.random.rand() < epsilon:
            return np.random.randint(self.action_n)
        s = torch.unsqueeze(torch.FloatTensor(observation), 0)
        actions_value = self.evaluate_net.forward(s)
        # Take the index of the largest action value in each row, move it to the CPU
        # and convert it to numpy.
        action = torch.max(actions_value, 1)[1].data.cpu().numpy()
        action = action[0]
        return action

agent2 = DoubleDQNAgent(env)
def cal_epsilon(count):
    # Exponentially decaying exploration rate, from 0.8 down towards 0.3.
    epsilon_start = 0.8
    epsilon_final = 0.3
    epsilon_decay = 10000
    epsilon = epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * count / epsilon_decay)
    return epsilon
def play_qlearning(env, agent, count, train=False):
    episode_reward = 0
    observation = env.reset()
    actionlist = []
    loss = 0
    while True:
        action = agent.decide(observation, count)
        next_observation, reward, done, _ = env.step(action)
        episode_reward += reward
        actionlist.append(action)
        if train:
            loss = agent.learn(observation, action, reward, next_observation, done, count)
        if done:
            sign = (reward == 20)  # True only if the episode ended at the goal
            final_state = env.state
            return episode_reward, actionlist, sign, loss, final_state
        observation = next_observation
episodes = 5000
episode_rewards = []
book0 = {}          # action sequence of each episode
total_loss = []
final_state = {}    # last state reached in each episode
for episode in range(episodes):
    episode_reward, act, flag, L, f_state = play_qlearning(env, agent2, episode + 1, train=True)
    if flag:
        print('Finished')
        break
    episode_rewards.append(episode_reward)
    total_loss.append(L)
    book0[str(episode)] = act
    final_state[str(episode)] = f_state
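
After the loop, a minimal sketch for inspecting training, continuing the script above (it reuses the matplotlib import from the top of the file; total_loss mixes the integer 0 and scalar loss tensors, so values are converted to floats before plotting):

# Plot per-episode reward and training loss collected by the loop above.
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(episode_rewards)
plt.xlabel('episode')
plt.ylabel('episode reward')
plt.subplot(1, 2, 2)
plt.plot([float(l) for l in total_loss])  # scalar tensors and 0 both convert to float
plt.xlabel('episode')
plt.ylabel('loss')
plt.tight_layout()
plt.show()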