Assignment 1 was mainly about getting familiar with the submission framework, so it is not described in detail here; the algorithm used was simply a random policy.
1. Assignment 2
1.1 Q-learning
Off-policy: the action used in the bootstrapped update target is not the action that the behaviour policy actually takes next, but the greedy (max) action over the current Q-table.
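For reference, the update implemented by the learn() method below is the standard tabular Q-learning rule:
Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))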
import pandas as pd
import numpy as np
import gym
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
### Note: the next action used in the update target is not chosen by the current behaviour policy
class CliffWalkingEnv:
def __init__(self, ncol, nrow):
self.nrow = nrow
self.ncol = ncol
        self.x = 0  # x-coordinate of the agent's current position
        self.y = self.nrow - 1  # y-coordinate of the agent's current position
    def step(self, action):  # called from outside to move the agent one step
        change = [[0, -1], [0, 1], [-1, 0], [1, 0]]  # 4 actions: 0 up, 1 down, 2 left, 3 right; the origin (0, 0) is the top-left corner
self.x = min(self.ncol - 1, max(0, self.x + change[action][0]))
self.y = min(self.nrow - 1, max(0, self.y + change[action][1]))
next_state = self.y * self.ncol + self.x
reward = -1
done = False
        if self.y == self.nrow - 1 and self.x > 0:  # the next position is on the cliff or at the goal
done = True
if self.x != self.ncol - 1:
reward = -100
return next_state, reward, done
    def reset(self):  # return to the initial state; the origin is at the top-left corner
self.x = 0
self.y = self.nrow - 1
return self.y * self.ncol + self.x
class QLearning:
def __init__(self,state_num,action_num,gammma,alpha,e_greedy):
self.q_table=np.zeros([state_num,action_num])
self.gamma=gammma
self.alpha=alpha
self.e_greedy=e_greedy
self.action_num=action_num
def choose_action(self,state):
        if np.random.random() > self.e_greedy:  # note: here e_greedy (0.9 below) is the probability of acting greedily, the opposite convention from the Sarsa and DQN classes later on
action=np.random.randint(self.action_num)
else:
action=np.argmax(self.q_table[state])
return action
def learn(self,c_state,c_action,n_state,r):
td_error=r+self.gamma*self.q_table[n_state].max()-self.q_table[c_state,c_action]
self.q_table[c_state,c_action]+=self.alpha*td_error
ncol=12
nrow=4
env = CliffWalkingEnv(ncol, nrow)
epsilon = 0.9
alpha = 0.1
gamma = 0.9
Episode=600
action_num=4
state_num=48
np.random.seed(0)
agent = QLearning(state_num, action_num, gamma, alpha, epsilon)
num_episodes=1000
return_list = []  # record the return of every episode
for i in range(10):  # split training across 10 progress bars
    with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar:  # tqdm progress bar
        for i_episode in range(int(num_episodes/10)):  # number of episodes per progress bar
episode_return = 0
state = env.reset()
done = False
while not done:
action = agent.choose_action(state)
next_state, reward, done = env.step(action)
                episode_return += reward  # the return is accumulated without applying the discount factor
agent.learn(state, action,next_state,reward)
state = next_state
return_list.append(episode_return)
            if (i_episode + 1) % 10 == 0:  # every 10 episodes, print the mean return of the last 10 episodes
pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode+1), 'return': '%.3f' % np.mean(return_list[-10:])})
pbar.update(1)
episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('Q-learning on {}'.format('Cliff Walking'))
plt.show()
with open('q_learning.pth', 'wb') as f:
pickle.dump(agent.q_table, f)
1.2 Sarsa
On-policy: unlike Q-learning, the action used in the update target is the next action actually chosen by the current behaviour policy.
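The only difference from Q-learning is the TD target: Sarsa bootstraps from the next action it actually takes, so the update implemented by learn() below is:
Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))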
class Sarsa:
def __init__(self,state_num,action_num,gammma,alpha,e_greedy):
self.q_table=np.zeros([state_num,action_num])
self.gamma=gammma
self.alpha=alpha
self.e_greedy=e_greedy
self.action_num=action_num
def choose_action(self,state):
if(np.random.random() <self.e_greedy):
action=np.random.randint(self.action_num)
else:
action=np.argmax(self.q_table[state])
return action
def learn(self,c_state,c_action,n_state,n_action,r):
td_error=r+self.gamma*self.q_table[n_state,n_action]-self.q_table[c_state,c_action]
self.q_table[c_state,c_action]+=self.alpha*td_error
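Only the agent class is shown above; a minimal training loop is sketched below. It reuses the CliffWalkingEnv and the hyperparameters from the Q-learning section, and note that Sarsa must pick the next action before it can update:
env = CliffWalkingEnv(12, 4)
agent = Sarsa(48, 4, 0.9, 0.1, 0.1)  # for this class e_greedy (0.1) is the probability of acting randomly
return_list = []
for i_episode in range(500):
    state = env.reset()
    action = agent.choose_action(state)
    episode_return = 0
    done = False
    while not done:
        next_state, reward, done = env.step(action)
        next_action = agent.choose_action(next_state)  # commit to the next action before updating
        agent.learn(state, action, next_state, next_action, reward)
        state, action = next_state, next_action
        episode_return += reward
    return_list.append(episode_return)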
1.3 n-step Sarsa
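The target implemented by learn() below is the n-step return, bootstrapped from the Q-value n steps ahead:
G = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1} + gamma^n * Q(s_{t+n}, a_{t+n})
Q(s_t, a_t) <- Q(s_t, a_t) + alpha * (G - Q(s_t, a_t))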
class nstep_Sarsa:
def __init__(self,state_num,action_num,gammma,alpha,e_greedy,n_step):
self.q_table=np.zeros([state_num,action_num])
self.action_num=action_num
self.gamma=gammma
self.alpha=alpha
self.e_greedy=e_greedy
self.n_step=n_step
self.state_list=[]
self.action_list=[]
self.reward_list=[]
def choose_action(self,state):
if(np.random.random()<self.e_greedy):
action=np.random.randint(self.action_num)
else:
action=np.argmax(self.q_table[state])
return action
def learn(self,c_state,c_action,n_state,n_action,r,done):
self.state_list.append(c_state)
self.action_list.append(c_action)
self.reward_list.append(r)
        if len(self.state_list) == self.n_step:  # enough transitions stored for an n-step update
            G = self.q_table[n_state, n_action]  # Q(s_{t+n}, a_{t+n})
            for i in reversed(range(self.n_step)):
                G = self.gamma * G + self.reward_list[i]  # accumulate the discounted return backwards
                if done and i > 0:  # if a terminal state is reached, also update the last few steps even though fewer than n remain
                    s = self.state_list[i]
                    a = self.action_list[i]
                    self.q_table[s, a] += self.alpha * (G - self.q_table[s, a])
            s = self.state_list.pop(0)  # remove the state/action that has just been updated; it will not be updated again
            a = self.action_list.pop(0)
            self.reward_list.pop(0)
            self.q_table[s, a] += self.alpha * (G - self.q_table[s, a])  # the main n-step Sarsa update
        if done:  # the episode is over, clear the buffers before the next one
            self.state_list = []
            self.action_list = []
            self.reward_list = []
1.4 Modified Sarsa
Adds an extra reward for exploration (a small bonus the first time a state is visited).
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm # tqdm是显示循环进度条的库
## add exploration reward
class nstep_Sarsachange:
    """ n-step Sarsa with an exploration bonus """
def __init__(self, n, ncol, nrow, epsilon, alpha, gamma, n_action=4):
self.Q_table = np.zeros([nrow * ncol, n_action])
self.n_action = n_action
self.alpha = alpha
self.gamma = gamma
self.epsilon = epsilon
        self.try_state = []  # states visited so far (used for the exploration bonus)
        self.n = n  # number of steps for n-step Sarsa
        self.state_list = []  # previously visited states
        self.action_list = []  # previously taken actions
        self.reward_list = []  # previously received rewards
def take_action(self, state):
if np.random.random() < self.epsilon:
action = np.random.randint(self.n_action)
else:
action = np.argmax(self.Q_table[state])
return action
def best_action(self, state): # 用于打印策略
Q_max = np.max(self.Q_table[state])
a = [0 for _ in range(self.n_action)]
for i in range(self.n_action):
if self.Q_table[state, i] == Q_max:
a[i] = 1
return a
def update(self, s0, a0, r, s1, a1, done):
self.state_list.append(s0)
self.action_list.append(a0)
self.reward_list.append(r)
        explore_bonus = 0
        if s0 not in self.try_state:  # first visit to this state: give an exploration bonus
            explore_bonus = 0.2
            self.try_state.append(s0)
        if len(self.state_list) == self.n:  # enough transitions stored for an n-step update
            G = self.Q_table[s1, a1]  # Q(s_{t+n}, a_{t+n})
            for i in reversed(range(self.n)):
                G = self.gamma * G + self.reward_list[i] + explore_bonus  # accumulate the return backwards (note: the bonus is added at every step of this backup)
                if done and i > 0:  # if a terminal state is reached, also update the last few steps even though fewer than n remain
                    s = self.state_list[i]
                    a = self.action_list[i]
                    self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a])
            s = self.state_list.pop(0)  # remove the state/action that has just been updated; it will not be updated again
            a = self.action_list.pop(0)
            self.reward_list.pop(0)
            self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a])  # the main n-step Sarsa update
        if done:  # the episode is over, clear the buffers before the next one
            self.state_list = []
            self.action_list = []
            self.reward_list = []
2. Assignment 3
2.1 DQN
For an environment with a continuous state space, a table cannot store every state-action value, so a neural network is used to approximate the action-value function.
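The learn() method below minimizes the mean-squared error between Q_eval(s, a) and a TD target computed with a separate, periodically copied target network:
y = r + gamma * (1 - done) * max_a' Q_target(s', a')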
import numpy as np
import gym
import matplotlib.pyplot as plt
import torch
import tqdm
import collections
import random
import torch.nn.functional as F
##经验池
class MemoryPool:
def __init__(self,capacity):
self.memory=collections.deque(maxlen=capacity) ##创建一个队列
##采样
    def sample(self, batch_size):
        transition = random.sample(self.memory, batch_size)  ## randomly sample batch_size transitions; the result is a list
        c_state, c_action, reward, next_state, done = zip(*transition)  # unzip into tuples
return np.array(c_state),c_action,reward,np.array(next_state),done
##添加记忆
def add(self,c_state,c_action,reward,n_state,done):
self.memory.append((c_state,c_action,reward,n_state,done))
## 返回长度
def size(self):
return len(self.memory)
##建立神经网络
class Q_net(torch.nn.Module):
def __init__(self,state_num,hidden_num,action_num):
super(Q_net,self).__init__()
self.fc1=torch.nn.Linear(state_num,hidden_num)
self.fc2=torch.nn.Linear(hidden_num,action_num)
## 前向传播
def forward(self,x): ##前向传播计算q值
x=F.relu(self.fc1(x)) ##只用一层激活函数
return self.fc2(x)
class DQN:
##初始化
def __init__(self,state_num,action_num,hidden_num,gamma,alpha,e_greedy,upgrade_step,device):
self.action_num=action_num
self.q_eval_net=Q_net(state_num,hidden_num,action_num)
self.q_target_net=Q_net(state_num,hidden_num,action_num)
self.gamma=gamma
self.alpha=alpha
self.e_greedy=e_greedy
self.count=0
self.upgrad_estep=upgrade_step
self.optimizer=torch.optim.Adam(self.q_eval_net.parameters(),lr=self.alpha)
##选择动作
def choose_action(self,state):
if(np.random.random()<self.e_greedy):
action=np.random.randint(self.action_num)
else:
state=torch.tensor([state],dtype=torch.float)
action=self.q_eval_net(state).argmax().item()
return action
##学习
def learn(self,trainsion):
states=torch.tensor(trainsion['state'],dtype=torch.float)
actions=torch.tensor(trainsion['action']).view(-1,1)
##view(-1,1) 意味着不知道几行,但需要一列,类似于reshape
rewards=torch.tensor(trainsion['reward'],dtype=torch.float).view(-1,1)
dones=torch.tensor(trainsion['done'],dtype=torch.float).view(-1,1)
n_states=torch.tensor(trainsion['next_state'],dtype=torch.float)
q_eval=self.q_eval_net(states).gather(1,actions) ##得到q值
max_next_q_value=self.q_target_net(n_states).max(1)[0].view(-1,1)
target_q=rewards+self.gamma*max_next_q_value*(1-dones) ##如果当前是结束状态,那么reward就是最后的reward
dqn_loss=torch.mean(F.mse_loss(target_q,q_eval)) ##均方误差 mean 是求平均值,mseloss 均方误差 (这两个值是个向量)
self.optimizer.zero_grad() ##将梯度置零
dqn_loss.backward() ##反向传播求梯度
self.optimizer.step() ##更新参数
if(self.count%self.upgrad_estep==0):
self.q_target_net.load_state_dict(self.q_eval_net.state_dict()) ##更新目标网络
self.count+=1
env_name="CartPole-v0"
env=gym.make(env_name)
Episode_num=1000
memory_capacity=10000
memeory=MemoryPool(memory_capacity)
upgrade_step=500
batch_size=64
hidden_dim=128
alpha=2e-3
action_num=env.action_space.n
state_num = env.observation_space.shape[0]
gamma=0.99
e_greedy=0.01
device = torch.device("cpu")
random.seed(0)
env.seed(0)
minimal_size = 500
np.random.seed(0)
torch.manual_seed(0)
agent=DQN(state_num,action_num,hidden_dim,gamma,alpha,e_greedy,upgrade_step,device)
for i in range(Episode_num):
state=env.reset()
episode_return=0 ##片段返回的回报
done=False
while not done:
action=agent.choose_action(state)
next_state,reward,done,_=env.step(action)
memeory.add(state,action,reward,next_state,done)
state=next_state
episode_return+=reward
if(memeory.size()>minimal_size): ##memory 中的数据量大于最低限度
b_s,b_a,b_r,b_ns,b_d=memeory.sample(batch_size)
transmition={'state':b_s,'action':b_a,'reward':b_r,'next_state':b_ns,'done':b_d}
agent.learn(transmition)
print(episode_return)
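The loop above only prints each episode's return; to get a learning curve like the one in the Q-learning section, the returns can be collected and smoothed. A minimal sketch, assuming episode_return is appended to a return_list at the end of each episode instead of being printed:
import numpy as np
import matplotlib.pyplot as plt

def moving_average(a, window_size=9):
    # simple moving average to smooth the noisy per-episode returns
    cumsum = np.cumsum(np.insert(np.array(a, dtype=float), 0, 0))
    return (cumsum[window_size:] - cumsum[:-window_size]) / window_size

plt.plot(moving_average(return_list))
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('DQN on CartPole-v0')
plt.show()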
2.2 DDQN (Double DQN)
In DQN, the target network is used both to select and to evaluate the next action, which tends to overestimate Q-values.
Here the eval network selects the next action and the target network evaluates that action's value, which reduces the overestimation.
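The change only affects the TD target (compare with learn() below):
DQN:        y = r + gamma * max_a' Q_target(s', a')
Double DQN: y = r + gamma * Q_target(s', argmax_a' Q_eval(s', a'))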
import numpy as np
import gym
import matplotlib.pyplot as plt
import torch
import tqdm
import collections
import random
import torch.nn.functional as F
##经验池
class MemoryPool:
def __init__(self,capacity):
self.memory=collections.deque(maxlen=capacity) ##创建一个队列
##采样
    def sample(self, batch_size):
        transition = random.sample(self.memory, batch_size)  ## randomly sample batch_size transitions; the result is a list
        c_state, c_action, reward, next_state, done = zip(*transition)  # unzip into tuples
return np.array(c_state),c_action,reward,np.array(next_state),done
##添加记忆
def add(self,c_state,c_action,reward,n_state,done):
self.memory.append((c_state,c_action,reward,n_state,done))
## 返回长度
def size(self):
return len(self.memory)
##建立神经网络
class Q_net(torch.nn.Module):
def __init__(self,state_num,hidden_num,action_num):
super(Q_net,self).__init__()
self.fc1=torch.nn.Linear(state_num,hidden_num)
self.fc2=torch.nn.Linear(hidden_num,action_num)
## 前向传播
def forward(self,x): ##前向传播计算q值
x=F.relu(self.fc1(x)) ##只用一层激活函数?
return self.fc2(x)
class DQN:
##初始化
def __init__(self,state_num,action_num,hidden_num,gamma,alpha,e_greedy,upgrade_step,device):
self.action_num=action_num
self.q_eval_net=Q_net(state_num,hidden_num,action_num)
self.q_target_net=Q_net(state_num,hidden_num,action_num)
self.gamma=gamma
self.alpha=alpha
self.e_greedy=e_greedy
self.count=0
self.upgrad_estep=upgrade_step
self.optimizer=torch.optim.Adam(self.q_eval_net.parameters(),lr=self.alpha)
##选择动作
def choose_action(self,state):
if(np.random.random()<self.e_greedy):
action=np.random.randint(self.action_num)
else:
state=torch.tensor([state],dtype=torch.float)
action=self.q_eval_net(state).argmax().item()
return action
##学习
def learn(self,trainsion):
states=torch.tensor(trainsion['state'],dtype=torch.float)
actions=torch.tensor(trainsion['action']).view(-1,1)
##view(-1,1) 意味着不知道几行,但需要一列,类似于reshape
rewards=torch.tensor(trainsion['reward'],dtype=torch.float).view(-1,1)
dones=torch.tensor(trainsion['done'],dtype=torch.float).view(-1,1)
n_states=torch.tensor(trainsion['next_state'],dtype=torch.float)
q_eval=self.q_eval_net(states).gather(1,actions) ##得到q值
        max_action = self.q_eval_net(n_states).max(1)[1].view(-1, 1)  ## the eval (online) network selects the next action
        max_next_q_value = self.q_target_net(n_states).gather(1, max_action)  ## the target network evaluates it, which reduces overestimation
target_q=rewards+self.gamma*max_next_q_value*(1-dones) ##如果当前是结束状态,那么reward就是最后的reward
dqn_loss=torch.mean(F.mse_loss(target_q,q_eval)) ##均方误差 mean 是求平均值,mseloss 均方误差 (这两个值是个向量)
self.optimizer.zero_grad() ##将梯度置零
dqn_loss.backward() ##反向传播求梯度
self.optimizer.step() ##更新参数
if(self.count%self.upgrad_estep==0):
self.q_target_net.load_state_dict(self.q_eval_net.state_dict()) ##更新目标网络
self.count+=1
env_name="CartPole-v0"
env=gym.make(env_name)
Episode_num=1000
memory_capacity=10000
memeory=MemoryPool(memory_capacity)
upgrade_step=500
batch_size=64
hidden_dim=128
alpha=2e-3
action_num=env.action_space.n
state_num = env.observation_space.shape[0]
gamma=0.99
e_greedy=0.01
device = torch.device("cpu")
random.seed(0)
env.seed(0)
minimal_size = 500
np.random.seed(0)
torch.manual_seed(0)
agent=DQN(state_num,action_num,hidden_dim,gamma,alpha,e_greedy,upgrade_step,device)
for i in range(Episode_num):
state=env.reset()
episode_return=0 ##片段返回的回报
done=False
while not done:
action=agent.choose_action(state)
next_state,reward,done,_=env.step(action)
memeory.add(state,action,reward,next_state,done)
state=next_state
episode_return+=reward
if(memeory.size()>minimal_size): ##memory 中的数据量大于最低限度
b_s,b_a,b_r,b_ns,b_d=memeory.sample(batch_size)
transmition={'state':b_s,'action':b_a,'reward':b_r,'next_state':b_ns,'done':b_d}
agent.learn(transmition)
print(episode_return)
2.3 Dueling DQN
The network output is split into a state-value stream and an advantage stream.
The advantage function captures how much better each action is than the others in the same state. To make the decomposition unique, the advantage of the best action is forced to zero (the code below subtracts the mean of the advantages instead, which serves the same purpose).
Note: every update also changes the state-value estimate, which in turn affects the values of all actions in that state, whereas vanilla DQN only updates the value of the single action that was taken. The state-value function is therefore learned more frequently and more accurately.
## The advantage is defined as A(s, a) = Q(s, a) - V(s)
## Within a state, the expected advantage over actions is 0 (the value of a state is the expectation of its action values)
## The Q network is split into a state-value head and an advantage head; their sum gives the final Q value
## V and A are not uniquely determined by Q, so the output is constrained (force the best action's advantage to 0, or subtract the mean, as below)
## (Advantage over DQN: as noted above, V is touched by every update, so it is learned more frequently and accurately than with DQN, which only updates one action value at a time.)
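A minimal sketch of the two common ways to enforce this constraint (dueling_aggregate is an illustrative name; V has shape [batch, 1] and A has shape [batch, n_actions]). The V_ANet class below uses the mean version:
import torch

def dueling_aggregate(V, A, use_max=False):
    if use_max:
        return V + A - A.max(dim=1, keepdim=True)[0]  # the advantage of the greedy action is forced to 0
    return V + A - A.mean(dim=1, keepdim=True)        # subtract the mean instead, as V_ANet does below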
import torch
import torch.nn.functional as F
import numpy as np
import random
import gym
import collections
## 记忆库
class MemoryPool:
def __init__(self,capacity):
self.memory=collections.deque(maxlen=capacity)
def sample(self,batch_size):
transition=random.sample(self.memory,batch_size)
states,actions,rewards,next_states,dones=zip(*transition)
return np.array(states),actions,rewards,np.array(next_states),dones
def add(self,c_state,c_action,reward,next_state,done):
self.memory.append((c_state,c_action,reward,next_state,done))
def size(self):
return len(self.memory)
## 神经网络
class V_ANet(torch.nn.Module):
def __init__(self,state_dim,action_dim,hidden_dim):
super(V_ANet,self).__init__()
self.fc1=torch.nn.Linear(state_dim,hidden_dim)
self.fc_A=torch.nn.Linear(hidden_dim,action_dim)
self.fc_V=torch.nn.Linear(hidden_dim,1)
def forward(self,input):
A=self.fc_A(F.relu(self.fc1(input)))
V=self.fc_V(F.relu(self.fc1(input)))
        return V + A - A.mean(1).view(-1, 1)  ## Q is recombined from V and A; subtracting the mean of A (rather than the max) keeps the decomposition identifiable
class DuelingDQN:
def __init__(self,state_dim,hidden_dim,action_dim,lr,gamma,e_greedy,upgrade_step):
self.action_dim=action_dim
self.q_net=V_ANet(state_dim,hidden_dim,action_dim)
self.targht_q_net=V_ANet(state_dim,hidden_dim,action_dim)
self.state_dim=state_dim
self.optimizer=torch.optim.Adam(self.q_net.parameters(),lr)
self.gamma=gamma
self.e_greedy=e_greedy
self.count=0
self.upgrade_step=upgrade_step
def choose_action(self,state):
if(np.random.random() <self.e_greedy):
action=np.random.randint(self.action_dim)
else:
state=torch.tensor([state],dtype=torch.float)
action=self.q_net(state).argmax().item()
return action
def learn(self,transition_dict):
states = torch.tensor(transition_dict['states'], dtype=torch.float)
actions = torch.tensor(transition_dict['actions']).view(-1, 1)
rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1)
next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float)
dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1)
q_value=self.q_net(states).gather(1,actions)
        max_action = self.q_net(next_states).max(1)[1].view(-1, 1)  # the online network selects the greedy next action
        max_next_q_value = self.targht_q_net(next_states).gather(1, max_action)  # the target network evaluates it
        q_target = rewards + self.gamma * max_next_q_value * (1 - dones)
dqn_loss=torch.mean(F.mse_loss(q_value,q_target))
self.optimizer.zero_grad()
dqn_loss.backward()
self.optimizer.step()
if(self.count%self.upgrade_step==0):
self.targht_q_net.load_state_dict(self.q_net.state_dict())
self.count+=1
def load(self,file):
pass
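No training loop is shown for this class; a minimal sketch, assuming the CartPole-v0 setup and hyperparameters from the DQN section are still in scope:
agent = DuelingDQN(state_num, hidden_dim, action_num, alpha, gamma, e_greedy, upgrade_step)
# the episode loop is the same as the DQN one above, except that this class expects plural keys
# in the transition dict:
# transition = {'states': b_s, 'actions': b_a, 'rewards': b_r, 'next_states': b_ns, 'dones': b_d}
# agent.learn(transition)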
3. Assignment 4: Multi-Agent Cooperation
The algorithm used here is IQL (independent Q-learning):
each agent treats the other agents as a fixed part of the environment and trains its own policy independently, using the single-agent framework.
(This is highly non-stationary and hard to converge; it still had not converged after 50000+ episodes.)
Concretely the algorithm is just DQN (apart from the network shared between the agents and the joint action).
Directions for improvement that were considered (a reward-shaping sketch for direction 2 follows this list):
1. Modifying the observation: tried adding the step count as an extra observation dimension (the snake game has a limited number of steps); it did not work well.
2. Modifying the reward.
3. Modifying the hyperparameters: since future actions matter relatively little in this snake game, the discount factor can be lowered; doing so clearly improved the results.
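As an illustration of direction 2, a hedged sketch of reward shaping; the flags ate_bean and died are hypothetical helpers, not part of the competition framework:
def shaped_reward(base_reward, ate_bean, died, step_penalty=0.01):
    # hypothetical shaping: encourage eating beans, discourage dying and wasting steps
    r = base_reward - step_penalty
    if ate_bean:
        r += 1.0
    if died:
        r -= 2.0
    return r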
4. Assignment 5: Multi-Agent Competition
Here the agents can again be trained with IQL (without sharing networks),
or trained adversarially against agents that use different policies.
Here a DDQN agent is trained against a DQN opponent.
(Observation changes: enlarge the snake's field of view. Reward changes: encourage actively eating beans and penalize dangerous collisions.)
Supplementary notes: some directions for optimization
For the multi-agent competition task:
1. Multi-stage algorithm
Extra information in the observation: the head position, the surrounding cells, the bean positions, and the other snake's relative position.
Increase the reward for winning and the penalty for losing at the end of each round.
2. Modified DDQN
Expand the observation dimensions, e.g. enlarge the snake's field of view (and train against snakes that use different policies).
(Extend the getSourding method.)
3. Replay buffer: anomalous and duplicated data
Union operations over replay buffers.
Uniform random updates / data efficiency / insufficient data? Samples should be weighted by their value when updating (i.e. something like prioritized replay).
4. Heuristic methods
A territory matrix and defensive behaviour.
Code sketches for some of these ideas:
1. Masking bad actions (a more compact version follows after the commented-out block):
# ## mask out actions that reverse the previous move
# if(last_action==0 and action==1):
# min_action = torch.argmin(self.critic_eval(observation)).item()
# take_action = np.random.randint(self.action_dim)
# if (take_action != min_action):
# r_action = take_action
# else:
# r_action = action
# if(last_action==1 and action==0):
# min_action = torch.argmin(self.critic_eval(observation)).item()
# take_action = np.random.randint(self.action_dim)
# if (take_action != min_action):
# r_action = take_action
# else:
# r_action = action
# if(last_action==2 and action==3):
# min_action = torch.argmin(self.critic_eval(observation)).item()
# take_action = np.random.randint(self.action_dim)
# if (take_action != min_action):
# r_action = take_action
# else:
# r_action = action
# if(last_action==3 and action==2):
# min_action = torch.argmin(self.critic_eval(observation)).item()
# take_action = np.random.randint(self.action_dim)
# if (take_action != min_action):
# r_action = take_action
# else:
# r_action = action
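The same idea written more compactly with an opposite-action table (a sketch; mask_reverse_action and OPPOSITE are illustrative names, and q_values is the array of Q-values for the current observation):
import numpy as np

OPPOSITE = {0: 1, 1: 0, 2: 3, 3: 2}  # up/down and left/right reverse each other

def mask_reverse_action(action, last_action, q_values, action_dim):
    # if the chosen action would reverse the previous move, resample a random action,
    # keeping it only if it is not the worst action according to the critic (same logic as above)
    if last_action in OPPOSITE and OPPOSITE[last_action] == action:
        min_action = int(np.argmin(q_values))
        take_action = np.random.randint(action_dim)
        if take_action != min_action:
            return take_action
    return action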
2. Expanding the observation:
## Field of view doubled. The layout actually used by the slicing below is:
# Self position: 0: head_x; 1: head_y
# Head surroundings (enlarged view): indices 2-13
# Bean positions: indices 14-23 (5 beans x 2 coordinates)
# Other snake head position: indices 24-25, so obs_dim should be 26 (2 + 12 + 10 + 2) rather than 24
def get_observations(state, agent_trained_index, obs_dim):
state_copy = state.copy()
agents_index = state_copy["controlled_snake_index"]
if agents_index != agent_trained_index:
        error = "controlled snake index: {name}, agent being trained: {url}".format(name=agents_index, url=agent_trained_index)
raise Exception(error)
board_width = state_copy['board_width']
board_height = state_copy['board_height']
beans_positions = state_copy[1]
snakes_positions = {key: state_copy[key] for key in state_copy.keys() & {2, 3}}
snake_map = make_grid_map(board_width, board_height, beans_positions, snakes_positions)
    state = np.squeeze(np.array(snake_map), axis=2)  # drop the trailing channel dimension
snakes_position = np.array(snakes_positions[agents_index], dtype=object)
beans_position = np.array(beans_positions).flatten()
observations = np.zeros((1, obs_dim)) # todo
# self head position
observations[0][:2] = snakes_position[0][:]
# head surroundings
head_x = snakes_position[0][1]
head_y = snakes_position[0][0]
head_surrounding = get_surrounding(state, board_width, board_height, head_x, head_y)
observations[0][2:14] = head_surrounding[:]
# beans positions
observations[0][14:24] = beans_position[:]
# other snake head positions
snakes_other_position = np.array(snakes_positions[3], dtype=object) # todo
observations[0][24:] = snakes_other_position[0][:]
return observations
3. Swapping the training opponent:
model = DDQN(obs_dim, action_dim, agent_trained_index, args)
model2=DDQN(obs_dim,action_dim,agent_compet_index,args)
model2.load("critic_18000.pth")
model.load("critic_18000.pth")
episode = 0
while episode < args.max_episodes:
state = env.reset()
state_rl_agent = get_state(state)
obs = get_observations(state_rl_agent, agent_trained_index, obs_dim)
state_rl_agent['controlled_snake_index'] = 3
obs2=get_observations(state_rl_agent,agent_compet_index,obs_dim)
episode += 1
step = 0
episode_reward = np.zeros(2)
last_action=-1
while True:
        action = model.choose_action(obs, last_action)
        # alternative opponent: a rule-based greedy controller
        # action2 = Greedy.my_controller(state_rl_agent, agent_compet_index, None)
        last_action = action
        action2 = model2.choose_action(obs2, last_action)  # the learned opponent picks its action
        actions = [action, action2]
next_state, reward, done, _, _ = env.step(env.encode(actions))
next_state_rl_agent = get_state(next_state)
reward = np.array(reward)
episode_reward += reward
if done:
if episode_reward[0] > episode_reward[1]:
step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=1)
elif episode_reward[0] < episode_reward[1]:
step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=2)
else:
step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=3)
next_obs = np.zeros((ctrl_agent_num, obs_dim))
else:
step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=0)
next_obs = get_observations(next_state_rl_agent, agent_trained_index, obs_dim)
done = np.array([done] * ctrl_agent_num)
# store transitions
trans = Transition(obs, actions, step_reward, np.array(next_obs), done)
model.store_transition(trans)
# model.add(trans)
# model.add_experience(trans)
model.learn()
obs = next_obs
state = next_state
step += 1