RL Homework Notes

Homework 1 was mainly about getting familiar with the submission framework, so it is not described in detail here; the algorithm used for it was simply a random policy.

1. Homework 2

1.1 Q-Learning

Off-policy learning: the action used in the TD target is not the one the behavior policy actually takes next, but the greedy (maximum-value) action.
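The update performed by the learn method below is the standard one-step Q-learning update:

$$Q(s_t,a_t) \leftarrow Q(s_t,a_t) + \alpha\left[r_t + \gamma \max_{a'} Q(s_{t+1},a') - Q(s_t,a_t)\right]$$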

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle


### Off-policy: the action used in the TD target is not the one chosen by the current behavior policy
class CliffWalkingEnv:
    def __init__(self, ncol, nrow):
        self.nrow = nrow
        self.ncol = ncol
        self.x = 0 # x-coordinate of the agent's current position
        self.y = self.nrow - 1 # y-coordinate of the agent's current position

    def step(self, action): # called externally to move the agent
        change = [[0, -1], [0, 1], [-1, 0], [1, 0]] # 4 actions, 0: up, 1: down, 2: left, 3: right; the origin (0, 0) is the top-left corner
        self.x = min(self.ncol - 1, max(0, self.x + change[action][0]))
        self.y = min(self.nrow - 1, max(0, self.y + change[action][1]))
        next_state = self.y * self.ncol + self.x
        reward = -1
        done = False
        if self.y == self.nrow - 1 and self.x > 0: # the next position is on the cliff or at the goal
            done = True
            if self.x != self.ncol - 1:
                reward = -100
        return next_state, reward, done

    def reset(self): # return to the initial state; the origin is at the top-left corner
        self.x = 0
        self.y = self.nrow - 1
        return self.y * self.ncol + self.x

class QLearning:
    def __init__(self, state_num, action_num, gamma, alpha, e_greedy):
        self.q_table = np.zeros([state_num, action_num])  # tabular Q(s, a)
        self.gamma = gamma          # discount factor
        self.alpha = alpha          # learning rate
        self.e_greedy = e_greedy    # probability of acting greedily (explore with probability 1 - e_greedy)
        self.action_num = action_num

    def choose_action(self, state):
        if np.random.random() > self.e_greedy:    # explore with probability 1 - e_greedy
            action = np.random.randint(self.action_num)
        else:                                     # otherwise act greedily w.r.t. the Q-table
            action = np.argmax(self.q_table[state])
        return action

    def learn(self, c_state, c_action, n_state, r):
        # off-policy TD target: bootstrap on the maximum Q-value of the next state
        td_error = r + self.gamma * self.q_table[n_state].max() - self.q_table[c_state, c_action]
        self.q_table[c_state, c_action] += self.alpha * td_error


ncol = 12
nrow = 4
env = CliffWalkingEnv(ncol, nrow)
epsilon = 0.9   # here: probability of acting greedily (see choose_action above)
alpha = 0.1
gamma = 0.9
action_num = 4
state_num = 48
np.random.seed(0)
agent = QLearning(state_num, action_num, gamma, alpha, epsilon)

num_episodes = 1000
return_list = [] # records the return of every episode
for i in range(10): # show 10 progress bars
    with tqdm(total=int(num_episodes/10), desc='Iteration %d' % i) as pbar: # tqdm progress bar
        for i_episode in range(int(num_episodes/10)): # number of episodes per progress bar
            episode_return = 0
            state = env.reset()
            done = False
            while not done:
                action = agent.choose_action(state)
                next_state, reward, done = env.step(action)
                episode_return += reward # the return here is not discounted
                agent.learn(state, action, next_state, reward)
                state = next_state
            return_list.append(episode_return)
            if (i_episode + 1) % 10 == 0: # every 10 episodes, print the average return of the last 10
                pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1), 'return': '%.3f' % np.mean(return_list[-10:])})
            pbar.update(1)

episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('Q-learning on {}'.format('Cliff Walking'))
plt.show()

with open('q_learning.pth', 'wb') as f:
    pickle.dump(agent.q_table, f)

1.2 Sarsa

On-policy learning: unlike Q-learning, the action used in the TD target is the one actually chosen by the current policy in the next state.
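The corresponding Sarsa update, as implemented in the learn method below, bootstraps on the Q-value of the sampled next action:

$$Q(s_t,a_t) \leftarrow Q(s_t,a_t) + \alpha\left[r_t + \gamma\, Q(s_{t+1},a_{t+1}) - Q(s_t,a_t)\right]$$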

class Sarsa:
    def __init__(self, state_num, action_num, gamma, alpha, e_greedy):
        self.q_table = np.zeros([state_num, action_num])
        self.gamma = gamma
        self.alpha = alpha
        self.e_greedy = e_greedy    # note: here e_greedy is the exploration probability, the opposite convention of the QLearning class above
        self.action_num = action_num

    def choose_action(self, state):
        if np.random.random() < self.e_greedy:    # explore with probability e_greedy
            action = np.random.randint(self.action_num)
        else:
            action = np.argmax(self.q_table[state])
        return action

    def learn(self, c_state, c_action, n_state, n_action, r):
        # on-policy TD target: bootstrap on the Q-value of the action actually taken next
        td_error = r + self.gamma * self.q_table[n_state, n_action] - self.q_table[c_state, c_action]
        self.q_table[c_state, c_action] += self.alpha * td_error

1.3 n-step Sarsa
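n-step Sarsa bootstraps only after accumulating n rewards. The target built in the learn method below is

$$G_t = r_t + \gamma r_{t+1} + \cdots + \gamma^{\,n-1} r_{t+n-1} + \gamma^{\,n} Q(s_{t+n},a_{t+n}), \qquad Q(s_t,a_t) \leftarrow Q(s_t,a_t) + \alpha\left[G_t - Q(s_t,a_t)\right]$$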

class nstep_Sarsa:
    def __init__(self, state_num, action_num, gamma, alpha, e_greedy, n_step):
        self.q_table = np.zeros([state_num, action_num])
        self.action_num = action_num
        self.gamma = gamma
        self.alpha = alpha
        self.e_greedy = e_greedy
        self.n_step = n_step
        self.state_list = []    # states of the last n steps
        self.action_list = []   # actions of the last n steps
        self.reward_list = []   # rewards of the last n steps

    def choose_action(self, state):
        if np.random.random() < self.e_greedy:    # explore with probability e_greedy
            action = np.random.randint(self.action_num)
        else:
            action = np.argmax(self.q_table[state])
        return action

    def learn(self, c_state, c_action, n_state, n_action, r, done):
        self.state_list.append(c_state)
        self.action_list.append(c_action)
        self.reward_list.append(r)
        if len(self.state_list) == self.n_step:  # enough data stored for an n-step update
            G = self.q_table[n_state, n_action]  # start from Q(s_{t+n}, a_{t+n})
            for i in reversed(range(self.n_step)):
                G = self.gamma * G + self.reward_list[i]  # accumulate the return backwards step by step
                if done and i > 0:  # if the terminal state is reached, also update the last few state-action pairs even though fewer than n steps remain
                    s = self.state_list[i]
                    a = self.action_list[i]
                    self.q_table[s, a] += self.alpha * (G - self.q_table[s, a])
            s = self.state_list.pop(0)  # remove the updated state-action pair so it is not updated again
            a = self.action_list.pop(0)
            self.reward_list.pop(0)
            self.q_table[s, a] += self.alpha * (G - self.q_table[s, a])  # the main n-step Sarsa update
        if done:  # the episode is over, so clear all the lists before the next one starts
            self.state_list = []
            self.action_list = []
            self.reward_list = []

1.4 Modified Sarsa

Adds an exploration bonus to the reward.
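The idea is to shape the reward with a first-visit bonus,

$$\tilde r_t = r_t + 0.2 \cdot \mathbb{1}\left[s_t \text{ is visited for the first time}\right],$$

and in the code below the bonus determined for the current state is added inside the n-step backup.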

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm # tqdm displays a progress bar for loops

## add exploration reward

class nstep_Sarsachange:
    """ n-step Sarsa with an exploration bonus """

    def __init__(self, n, ncol, nrow, epsilon, alpha, gamma, n_action=4):
        self.Q_table = np.zeros([nrow * ncol, n_action])
        self.n_action = n_action
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.try_state = []       # states that have already been visited
        self.n = n  # use n-step Sarsa
        self.state_list = []  # previously visited states
        self.action_list = []  # previously taken actions
        self.reward_list = []  # previously received rewards

    def take_action(self, state):
        if np.random.random() < self.epsilon:
            action = np.random.randint(self.n_action)
        else:
            action = np.argmax(self.Q_table[state])
        return action

    def best_action(self, state):  # used when printing the learned policy
        Q_max = np.max(self.Q_table[state])
        a = [0 for _ in range(self.n_action)]
        for i in range(self.n_action):
            if self.Q_table[state, i] == Q_max:
                a[i] = 1
        return a

    def update(self, s0, a0, r, s1, a1, done):
        self.state_list.append(s0)
        self.action_list.append(a0)
        self.reward_list.append(r)
        explore_bonus = 0
        if s0 not in self.try_state:     # first visit to this state: grant an exploration bonus
            explore_bonus = 0.2
            self.try_state.append(s0)
        if len(self.state_list) == self.n:  # enough data stored for an n-step update
            G = self.Q_table[s1, a1]  # start from Q(s_{t+n}, a_{t+n})
            for i in reversed(range(self.n)):
                G = self.gamma * G + self.reward_list[i] + explore_bonus # accumulate the return backwards, adding the exploration bonus
                if done and i > 0:  # if the terminal state is reached, also update the last few state-action pairs even though fewer than n steps remain
                    s = self.state_list[i]
                    a = self.action_list[i]
                    self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a])
            s = self.state_list.pop(0)  # remove the updated state-action pair so it is not updated again
            a = self.action_list.pop(0)
            self.reward_list.pop(0)
            self.Q_table[s, a] += self.alpha * (G - self.Q_table[s, a])  # the main n-step Sarsa update
        if done:  # the episode is over, so clear all the lists before the next one starts
            self.state_list = []
            self.action_list = []
            self.reward_list = []

2. Homework 3

2.1 DQN

With a continuous state space it is impractical to store every state-action pair in a table, so a neural network is used to approximate the action-value function.
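The network is trained to minimize the TD error against a periodically refreshed target network (parameters θ⁻), as in the learn method below:

$$L(\theta) = \mathbb{E}\left[\left(r + \gamma\,(1-d)\max_{a'} Q(s',a';\theta^-) - Q(s,a;\theta)\right)^2\right]$$

where d = 1 at terminal transitions, so no bootstrapping happens past the end of an episode.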

import numpy as np
import gym
import matplotlib.pyplot as plt
import torch
import tqdm
import collections
import random
import torch.nn.functional as F

## replay buffer
class MemoryPool:
    def __init__(self, capacity):
        self.memory = collections.deque(maxlen=capacity)  ## a queue with a fixed maximum length

    ## sample a random mini-batch
    def sample(self, batch_size):
        transition = random.sample(self.memory, batch_size) ## randomly pick batch_size transitions (as a list)
        c_state, c_action, reward, next_state, done = zip(*transition)  # unzip into tuples
        return np.array(c_state), c_action, reward, np.array(next_state), done

    ## store a transition
    def add(self, c_state, c_action, reward, n_state, done):
        self.memory.append((c_state, c_action, reward, n_state, done))

    ## current number of stored transitions
    def size(self):
        return len(self.memory)

## Q-network
class Q_net(torch.nn.Module):
    def __init__(self, state_num, hidden_num, action_num):
        super(Q_net, self).__init__()
        self.fc1 = torch.nn.Linear(state_num, hidden_num)
        self.fc2 = torch.nn.Linear(hidden_num, action_num)

    ## forward pass: compute the Q-values of all actions
    def forward(self, x):
        x = F.relu(self.fc1(x)) ## a single hidden layer with ReLU activation
        return self.fc2(x)


class DQN:
    ## initialization
    def __init__(self, state_num, action_num, hidden_num, gamma, alpha, e_greedy, upgrade_step, device):
        self.action_num = action_num
        self.q_eval_net = Q_net(state_num, hidden_num, action_num)     # online (eval) network
        self.q_target_net = Q_net(state_num, hidden_num, action_num)   # target network
        self.gamma = gamma
        self.alpha = alpha
        self.e_greedy = e_greedy
        self.count = 0
        self.upgrade_step = upgrade_step   # how often to copy the eval network into the target network
        self.optimizer = torch.optim.Adam(self.q_eval_net.parameters(), lr=self.alpha)

    ## choose an action (epsilon-greedy)
    def choose_action(self, state):
        if np.random.random() < self.e_greedy:
            action = np.random.randint(self.action_num)
        else:
            state = torch.tensor([state], dtype=torch.float)
            action = self.q_eval_net(state).argmax().item()
        return action

    ## learn from a mini-batch of transitions
    def learn(self, transition_dict):
        states = torch.tensor(transition_dict['state'], dtype=torch.float)
        actions = torch.tensor(transition_dict['action']).view(-1, 1)
        ## view(-1, 1) keeps one column and infers the number of rows, similar to reshape
        rewards = torch.tensor(transition_dict['reward'], dtype=torch.float).view(-1, 1)
        dones = torch.tensor(transition_dict['done'], dtype=torch.float).view(-1, 1)
        n_states = torch.tensor(transition_dict['next_state'], dtype=torch.float)

        q_eval = self.q_eval_net(states).gather(1, actions)  ## Q-values of the taken actions
        max_next_q_value = self.q_target_net(n_states).max(1)[0].view(-1, 1)

        target_q = rewards + self.gamma * max_next_q_value * (1 - dones) ## for terminal transitions the target is just the reward
        dqn_loss = torch.mean(F.mse_loss(q_eval, target_q))   ## mean squared TD error
        self.optimizer.zero_grad() ## zero the gradients
        dqn_loss.backward()   ## backpropagate
        self.optimizer.step()  ## update the parameters
        if self.count % self.upgrade_step == 0:
            self.q_target_net.load_state_dict(self.q_eval_net.state_dict()) ## refresh the target network
        self.count += 1


env_name="CartPole-v0"
env=gym.make(env_name)
Episode_num=1000
memory_capacity=10000
memeory=MemoryPool(memory_capacity)
upgrade_step=500
batch_size=64
hidden_dim=128
alpha=2e-3
action_num=env.action_space.n
state_num = env.observation_space.shape[0]
gamma=0.99
e_greedy=0.01
device = torch.device("cpu")
random.seed(0)
env.seed(0)
minimal_size = 500
np.random.seed(0)
torch.manual_seed(0)
agent=DQN(state_num,action_num,hidden_dim,gamma,alpha,e_greedy,upgrade_step,device)
for i in range(Episode_num):
    state=env.reset()
    episode_return=0  ##片段返回的回报
    done=False
    while not done:
        action=agent.choose_action(state)
        next_state,reward,done,_=env.step(action)
        memeory.add(state,action,reward,next_state,done)
        state=next_state
        episode_return+=reward
        if(memeory.size()>minimal_size):   ##memory 中的数据量大于最低限度
            b_s,b_a,b_r,b_ns,b_d=memeory.sample(batch_size)
            transmition={'state':b_s,'action':b_a,'reward':b_r,'next_state':b_ns,'done':b_d}
            agent.learn(transmition)
    print(episode_return)

2.2 DDQN (Double DQN)

In DQN, both the selection and the evaluation of the next action use the target network, which tends to overestimate Q-values.
Double DQN instead uses the eval network to select the next action and the target network to evaluate it, which reduces the overestimation.
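The only change relative to the DQN target is therefore how the next action is chosen:

$$y = r + \gamma\,(1-d)\; Q\!\left(s',\, \arg\max_{a'} Q(s',a';\theta);\ \theta^{-}\right)$$

with θ the eval-network parameters and θ⁻ the target-network parameters; this corresponds to the two lines in the learn method below that select and then evaluate the next action.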

import numpy as np
import gym
import matplotlib.pyplot as plt
import torch
import tqdm
import collections
import random
import torch.nn.functional as F

## replay buffer (identical to Section 2.1)
class MemoryPool:
    def __init__(self, capacity):
        self.memory = collections.deque(maxlen=capacity)  ## a queue with a fixed maximum length

    ## sample a random mini-batch
    def sample(self, batch_size):
        transition = random.sample(self.memory, batch_size) ## randomly pick batch_size transitions (as a list)
        c_state, c_action, reward, next_state, done = zip(*transition)  # unzip into tuples
        return np.array(c_state), c_action, reward, np.array(next_state), done

    ## store a transition
    def add(self, c_state, c_action, reward, n_state, done):
        self.memory.append((c_state, c_action, reward, n_state, done))

    ## current number of stored transitions
    def size(self):
        return len(self.memory)

## Q-network (identical to Section 2.1)
class Q_net(torch.nn.Module):
    def __init__(self, state_num, hidden_num, action_num):
        super(Q_net, self).__init__()
        self.fc1 = torch.nn.Linear(state_num, hidden_num)
        self.fc2 = torch.nn.Linear(hidden_num, action_num)

    ## forward pass: compute the Q-values of all actions
    def forward(self, x):
        x = F.relu(self.fc1(x)) ## a single hidden layer with ReLU activation
        return self.fc2(x)


class DDQN:
    ## initialization (identical to the DQN agent in Section 2.1)
    def __init__(self, state_num, action_num, hidden_num, gamma, alpha, e_greedy, upgrade_step, device):
        self.action_num = action_num
        self.q_eval_net = Q_net(state_num, hidden_num, action_num)     # online (eval) network
        self.q_target_net = Q_net(state_num, hidden_num, action_num)   # target network
        self.gamma = gamma
        self.alpha = alpha
        self.e_greedy = e_greedy
        self.count = 0
        self.upgrade_step = upgrade_step
        self.optimizer = torch.optim.Adam(self.q_eval_net.parameters(), lr=self.alpha)

    ## choose an action (epsilon-greedy)
    def choose_action(self, state):
        if np.random.random() < self.e_greedy:
            action = np.random.randint(self.action_num)
        else:
            state = torch.tensor([state], dtype=torch.float)
            action = self.q_eval_net(state).argmax().item()
        return action

    ## learn from a mini-batch of transitions
    def learn(self, transition_dict):
        states = torch.tensor(transition_dict['state'], dtype=torch.float)
        actions = torch.tensor(transition_dict['action']).view(-1, 1)
        ## view(-1, 1) keeps one column and infers the number of rows, similar to reshape
        rewards = torch.tensor(transition_dict['reward'], dtype=torch.float).view(-1, 1)
        dones = torch.tensor(transition_dict['done'], dtype=torch.float).view(-1, 1)
        n_states = torch.tensor(transition_dict['next_state'], dtype=torch.float)

        q_eval = self.q_eval_net(states).gather(1, actions)  ## Q-values of the taken actions
        max_action = self.q_eval_net(n_states).max(1)[1].view(-1, 1)  ## the eval network selects the next action
        max_next_q_value = self.q_target_net(n_states).gather(1, max_action) ## the target network evaluates it, reducing overestimation

        target_q = rewards + self.gamma * max_next_q_value * (1 - dones) ## for terminal transitions the target is just the reward
        dqn_loss = torch.mean(F.mse_loss(q_eval, target_q))   ## mean squared TD error
        self.optimizer.zero_grad() ## zero the gradients
        dqn_loss.backward()   ## backpropagate
        self.optimizer.step()  ## update the parameters
        if self.count % self.upgrade_step == 0:
            self.q_target_net.load_state_dict(self.q_eval_net.state_dict()) ## refresh the target network
        self.count += 1



env_name="cartPole_v0"
env=gym.make(env_name)
Episode_num=1000
memory_capacity=10000
memeory=MemoryPool(memory_capacity)
upgrade_step=500
batch_size=64
hidden_dim=128
alpha=2e-3
action_num=env.action_space.n
state_num = env.observation_space.shape[0]
gamma=0.99
e_greedy=0.01
device = torch.device("cpu")
random.seed(0)
env.seed(0)
minimal_size = 500
np.random.seed(0)
torch.manual_seed(0)
agent=DQN(state_num,action_num,hidden_dim,gamma,alpha,e_greedy,upgrade_step,device)
for i in range(Episode_num):
    state=env.reset()
    episode_return=0  ##片段返回的回报
    done=False
    while not done:
        action=agent.choose_action(state)
        next_state,reward,done,_=env.step(action)
        memeory.add(state,action,reward,next_state,done)
        state=next_state
        episode_return+=reward
        if(memeory.size()>minimal_size):   ##memory 中的数据量大于最低限度
            b_s,b_a,b_r,b_ns,b_d=memeory.sample(batch_size)
            transmition={'state':b_s,'action':b_a,'reward':b_r,'next_state':b_ns,'done':b_d}
            agent.learn(transmition)
    print(episode_return)

2.3 Dueling DQN

The Q-network is split into two heads: a state-value function V(s) and an advantage function A(s, a), which captures how much better one action is than the others in the same state.
To make the decomposition unique, one can force the advantage of the best action to be zero; the implementation below instead subtracts the mean advantage, which serves the same purpose.

Note: every update changes V(s), which affects the estimated values of all actions in that state, whereas vanilla DQN only updates the value of the sampled action. The state-value function is therefore learned more frequently and accurately.
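The aggregation used by the network below (mean-centred advantages) is

$$Q(s,a) = V(s) + A(s,a) - \frac{1}{|\mathcal{A}|}\sum_{a'} A(s,a')$$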

## The action advantage is defined as A(s,a) = Q(s,a) - V(s)
## Within a state its expectation is 0 (a state's value is the expectation of the values of its actions)
## The Q-network is split into a state-value head and an advantage head, and their sum gives the final Q-value
## Since V and A are not uniquely determined, one can force the advantage of the best action to be 0 (here the mean is subtracted instead)

import torch
import torch.nn.functional as F
import numpy as np
import random
import gym
import collections


## replay buffer
class MemoryPool:
    def __init__(self,capacity):
        self.memory=collections.deque(maxlen=capacity)

    def sample(self,batch_size):
        transition=random.sample(self.memory,batch_size)
        states,actions,rewards,next_states,dones=zip(*transition)
        return np.array(states),actions,rewards,np.array(next_states),dones

    def add(self,c_state,c_action,reward,next_state,done):
        self.memory.append((c_state,c_action,reward,next_state,done))

    def size(self):
        return len(self.memory)


## network with a shared hidden layer and separate value and advantage heads
class V_ANet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(V_ANet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)  # advantage head
        self.fc_V = torch.nn.Linear(hidden_dim, 1)           # state-value head

    def forward(self, input):
        hidden = F.relu(self.fc1(input))
        A = self.fc_A(hidden)
        V = self.fc_V(hidden)
        return V + A - A.mean(1).view(-1, 1)  ## combine the two heads; the mean advantage is subtracted instead of the max



class DuelingDQN:
    def __init__(self,state_dim,hidden_dim,action_dim,lr,gamma,e_greedy,upgrade_step):
        self.action_dim=action_dim
        self.q_net=V_ANet(state_dim,hidden_dim,action_dim)
        self.target_q_net = V_ANet(state_dim, hidden_dim, action_dim)
        self.state_dim=state_dim
        self.optimizer=torch.optim.Adam(self.q_net.parameters(),lr)
        self.gamma=gamma
        self.e_greedy=e_greedy
        self.count=0
        self.upgrade_step=upgrade_step

    def choose_action(self,state):
        if(np.random.random() <self.e_greedy):
            action=np.random.randint(self.action_dim)
        else:
            state=torch.tensor([state],dtype=torch.float)
            action=self.q_net(state).argmax().item()
        return action

    def learn(self,transition_dict):
        states = torch.tensor(transition_dict['states'], dtype=torch.float)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1)

        q_value = self.q_net(states).gather(1, actions)
        # the online network selects the next action, the target network evaluates it
        max_action = self.q_net(next_states).max(1)[1].view(-1, 1)
        max_next_q_value = self.target_q_net(next_states).gather(1, max_action)

        q_target = rewards + self.gamma * max_next_q_value * (1 - dones)
        dqn_loss = torch.mean(F.mse_loss(q_value, q_target))
        self.optimizer.zero_grad()
        dqn_loss.backward()
        self.optimizer.step()

        if self.count % self.upgrade_step == 0:
            self.target_q_net.load_state_dict(self.q_net.state_dict())
        self.count += 1
        
    def load(self,file):
        pass

3. Homework 4: Multi-Agent Cooperation

The algorithm used here is IQL (Independent Q-Learning):
each agent treats the other agents as fixed parts of the environment and is trained independently within a single-agent learning framework.
(This is highly unstable and hard to get to converge; it still had not converged after 50,000+ iterations.)
Concretely, the algorithm is just DQN (apart from the network shared between the agents and the joint action passed to the environment).
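A minimal sketch of the IQL training scheme, assuming a hypothetical multi-agent environment multi_env that returns one observation and one reward per agent, and reusing the DQN and MemoryPool classes from Section 2.1 without parameter sharing; hyperparameter names are the ones defined there:

# IQL sketch: every agent learns with its own DQN and replay buffer,
# treating all other agents as part of the environment.
n_agents = 2                                       # assumed number of snakes
agents = [DQN(state_num, action_num, hidden_dim, gamma, alpha,
              e_greedy, upgrade_step, device) for _ in range(n_agents)]
buffers = [MemoryPool(memory_capacity) for _ in range(n_agents)]

obs_list = multi_env.reset()                       # hypothetical: one observation per agent
done = False
while not done:
    # each agent acts independently on its own observation
    actions = [agent.choose_action(obs) for agent, obs in zip(agents, obs_list)]
    next_obs_list, rewards, done, _ = multi_env.step(actions)
    for i in range(n_agents):
        # from agent i's point of view, the other agents are just environment dynamics
        buffers[i].add(obs_list[i], actions[i], rewards[i], next_obs_list[i], done)
        if buffers[i].size() > minimal_size:
            b_s, b_a, b_r, b_ns, b_d = buffers[i].sample(batch_size)
            agents[i].learn({'state': b_s, 'action': b_a, 'reward': b_r,
                             'next_state': b_ns, 'done': b_d})
    obs_list = next_obs_list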

Directions considered for improvement:
1. Modifying the observation: I tried adding the step count as an extra observation dimension (the snake game runs for a fixed number of steps), but it did not help.

2. Modifying the reward.

3. Modifying the hyperparameters: since the snake game does not depend much on long-horizon returns, the discount factor can be lowered; doing so did noticeably improve the results (see the rough estimate below).
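A rough way to see the effect of lowering the discount factor: with discount γ the effective planning horizon is about 1/(1 - γ), so

$$\frac{1}{1-0.99} = 100 \text{ steps}, \qquad \frac{1}{1-0.9} = 10 \text{ steps},$$

i.e. a smaller γ makes the agent focus on much shorter-term reward, which matches the intuition above.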

4. Homework 5: Multi-Agent Competition

Here the agents can again be trained with IQL (without sharing networks),
or an agent can be trained adversarially against agents that use different policies.
I use DDQN and train it against a DQN-based opponent.

(Observation changes: enlarge the snake's field of view. Reward changes: encourage actively eating beans and penalize dangerous collisions; a hypothetical shaping sketch is given below.)
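A minimal reward-shaping sketch in that spirit. The training code later in this section calls a get_reward(state, index, reward, final_result) helper; the version below is only an illustration of the idea, and the bonus/penalty values and the ate_bean / dangerous_collision flags are assumptions, not the original implementation:

# Hypothetical reward shaping: encourage eating beans, penalize risky collisions,
# and add a large terminal bonus/penalty depending on the final result.
def shaped_reward(env_reward, ate_bean, dangerous_collision, final_result):
    r = float(env_reward)
    if ate_bean:                  # assumed flag: the snake ate a bean this step
        r += 0.5
    if dangerous_collision:       # assumed flag: the snake moved next to a body or wall
        r -= 1.0
    if final_result == 1:         # won the round (same convention as the code below)
        r += 10.0
    elif final_result == 2:       # lost the round
        r -= 10.0
    return r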

Additional optimization directions for the multi-agent competition task:

1. Multi-stage algorithm
Add extra information to the observation: the head position, the surrounding cells, the bean positions, and the other snakes' positions relative to our own.
Increase the win and loss rewards given at the end of each round.

2. Modified DDQN

Extend the observation dimensions, e.g. enlarge the snake's field of view, and train against snakes that use different policies.
(Extend the get_surrounding method accordingly.)

3. Replay buffer: abnormal and duplicate data
Deduplicate (take the union of) the transitions in the replay buffer.
With purely random updates the data efficiency can be low and the data insufficient, so updates should weight transitions by how valuable they are (a small deduplication sketch follows after this list).

4. Heuristic methods
A territory matrix and defensive behaviour.
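A minimal sketch of the deduplication idea from item 3, assuming transitions are tuples of array-like states and scalar actions/rewards; the class name and hashing scheme are illustrative, not from the original code:

import collections
import random
import numpy as np

class DedupMemoryPool:
    """Replay buffer that skips exact duplicate transitions."""
    def __init__(self, capacity):
        self.memory = collections.deque(maxlen=capacity)
        self.seen = set()   # hashes of stored transitions (eviction bookkeeping omitted for brevity)

    def add(self, state, action, reward, next_state, done):
        key = hash((tuple(np.ravel(state)), action, reward,
                    tuple(np.ravel(next_state)), done))
        if key in self.seen:      # drop exact duplicates instead of storing them again
            return
        self.seen.add(key)
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        transition = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*transition)
        return np.array(states), actions, rewards, np.array(next_states), dones

    def size(self):
        return len(self.memory)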

Code for some of these optimization ideas:
1. Masking bad actions:

 # ## Mask out the action that directly reverses the previous one (0/1 and 2/3 are opposite directions)
        # reverse_pairs = {(0, 1), (1, 0), (2, 3), (3, 2)}
        # if (last_action, action) in reverse_pairs:
        #     min_action = torch.argmin(self.critic_eval(observation)).item()
        #     take_action = np.random.randint(self.action_dim)
        #     if take_action != min_action:
        #         r_action = take_action
        #     else:
        #         r_action = action

2. Increasing the observation dimension:

## Field of view doubled; the observation dimension grows to 24

# Original layout (before the extension):
# Self position:        0:head_x; 1:head_y
# Head surroundings:    2:head_up; 3:head_down; 4:head_left; 5:head_right
# Beans positions:      (6, 7) (8, 9) (10, 11) (12, 13) (14, 15)
# Other snake positions: (16, 17) -- (other_x - self_x, other_y - self_y)
# (the indices used in the code below are shifted by the enlarged surroundings block)
def get_observations(state, agent_trained_index, obs_dim):
    state_copy = state.copy()

    agents_index = state_copy["controlled_snake_index"]

    if agents_index != agent_trained_index:
        error = "agent being trained: {trained}, agent being observed: {observed}".format(
            trained=agent_trained_index, observed=agents_index)
        raise Exception(error)

    board_width = state_copy['board_width']
    board_height = state_copy['board_height']
    beans_positions = state_copy[1]
    snakes_positions = {key: state_copy[key] for key in state_copy.keys() & {2, 3}}
    snake_map = make_grid_map(board_width, board_height, beans_positions, snakes_positions)
    state = np.array(snake_map)
    state = np.squeeze(state, axis=2)
    snakes_position = np.array(snakes_positions[agents_index], dtype=object)

    beans_position = np.array(beans_positions).flatten()

    observations = np.zeros((1, obs_dim)) # todo

    # self head position
    observations[0][:2] = snakes_position[0][:]

    # head surroundings
    head_x = snakes_position[0][1]
    head_y = snakes_position[0][0]
    head_surrounding = get_surrounding(state, board_width, board_height, head_x, head_y)
    observations[0][2:14] = head_surrounding[:]

    # beans positions
    observations[0][14:24] = beans_position[:]

    # other snake head positions
    snakes_other_position = np.array(snakes_positions[3], dtype=object) # todo
    observations[0][24:] = snakes_other_position[0][:]

    return observations

3. Swapping the training opponent:


    model = DDQN(obs_dim, action_dim, agent_trained_index, args)
    model2=DDQN(obs_dim,action_dim,agent_compet_index,args)
    model2.load("critic_18000.pth")
    model.load("critic_18000.pth")
    episode = 0

    while episode < args.max_episodes:
        state = env.reset()

        state_rl_agent = get_state(state)


        obs = get_observations(state_rl_agent, agent_trained_index, obs_dim)
        state_rl_agent['controlled_snake_index'] = 3   # switch the observed snake to the opponent (index 3)
        obs2=get_observations(state_rl_agent,agent_compet_index,obs_dim)
        episode += 1
        step = 0
        episode_reward = np.zeros(2)
        last_action=-1
        while True:
            action = model.choose_action(obs, last_action)
            last_action = action
            # alternative opponent: a scripted greedy policy
            # action2 = Greedy.my_controller(state_rl_agent, agent_compet_index, None)
            action2 = model2.choose_action(obs2, last_action)
            actions = [action, action2]

            next_state, reward, done, _, _ = env.step(env.encode(actions))

            next_state_rl_agent = get_state(next_state)

            reward = np.array(reward)
            episode_reward += reward

            if done:
                if episode_reward[0] > episode_reward[1]:
                    step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=1)
                elif episode_reward[0] < episode_reward[1]:
                    step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=2)
                else:
                    step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=3)
                next_obs = np.zeros((ctrl_agent_num, obs_dim))
            else:
                step_reward = get_reward(next_state_rl_agent, ctrl_agent_index, reward, final_result=0)
                next_obs = get_observations(next_state_rl_agent, agent_trained_index, obs_dim)

            done = np.array([done] * ctrl_agent_num)

            # store transitions
            trans = Transition(obs, actions, step_reward, np.array(next_obs), done)
            model.store_transition(trans)
            # model.add(trans)
            # model.add_experience(trans)
            model.learn()

            obs = next_obs
            state = next_state
            step += 1