Reinforcement Learning: A Pseudocode Summary and DQN Code for Solving MountainCar-v0

Semi-gradient Sarsa Pseudocode

[Figure: semi-gradient Sarsa pseudocode]
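
A minimal Python sketch of (episodic) semi-gradient Sarsa, assuming a differentiable approximator q_hat(s, a, w) with gradient grad_q and a weight vector w; these names are illustrative stand-ins, not anything defined elsewhere in this post:

import numpy as np

def semi_gradient_sarsa_episode(env, q_hat, grad_q, w, alpha=0.01, gamma=0.99, epsilon=0.1):
    # One episode of semi-gradient Sarsa with an epsilon-greedy policy
    def eps_greedy(s):
        if np.random.random() < epsilon:
            return env.action_space.sample()
        return int(np.argmax([q_hat(s, a, w) for a in range(env.action_space.n)]))

    s, _ = env.reset()
    a = eps_greedy(s)
    while True:
        s_next, r, terminated, truncated, _ = env.step(a)
        if terminated or truncated:
            # terminal transition: the target is just the reward
            w = w + alpha * (r - q_hat(s, a, w)) * grad_q(s, a, w)
            break
        a_next = eps_greedy(s_next)
        # on-policy TD target built from the action that will actually be taken next
        w = w + alpha * (r + gamma * q_hat(s_next, a_next, w) - q_hat(s, a, w)) * grad_q(s, a, w)
        s, a = s_next, a_next
    return w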

Q-Learning Pseudocode

[Figure: Q-Learning pseudocode]
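
For comparison, a minimal tabular Q-learning sketch; it assumes a small discrete environment such as FrozenLake (not MountainCar), and all names are illustrative:

import numpy as np

def q_learning(env, num_episodes=500, alpha=0.1, gamma=0.99, epsilon=0.1):
    # Tabular Q-learning with an epsilon-greedy behavior policy
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    for _ in range(num_episodes):
        s, _ = env.reset()
        done = False
        while not done:
            # epsilon-greedy action selection
            a = env.action_space.sample() if np.random.random() < epsilon else int(np.argmax(Q[s]))
            s_next, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            # off-policy target: maximize over the next state's actions
            target = r if terminated else r + gamma * np.max(Q[s_next])
            Q[s, a] += alpha * (target - Q[s, a])
            s = s_next
    return Q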

DQN Pseudocode

DQN uses experience replay and a target network to break the correlations between successive samples, which is what allows the neural network to train stably and converge.

[Figure: DQN pseudocode]
$$[14]:\quad \theta_{t+1} \leftarrow \theta_{t} + \alpha\Big[r_t + \gamma\max_{a'} Q(s_{t+1}, a'; \theta^-) - Q(s_t, a_t; \theta)\Big]\nabla Q(s_t, a_t; \theta)$$

$$[15]:\quad \theta = \theta + \Delta\theta$$

$$[16]:\quad \theta^- = \theta$$
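
The update in [14] keeps θ⁻ fixed and differentiates only through Q(s, a; θ). In PyTorch this can be sketched as follows; online_net, target_net, and the batch tensors are illustrative names, not the code used later in this post:

import torch

def dqn_loss(online_net, target_net, states, actions, rewards, next_states, dones, gamma=0.99):
    # Q(s_t, a_t; theta) for the actions that were actually taken
    q_sa = online_net(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    # the bracketed target in [14] uses the frozen parameters theta^-
    with torch.no_grad():
        max_next_q = target_net(next_states).max(dim=1).values
        td_target = rewards + gamma * (1.0 - dones.float()) * max_next_q
    # minimizing this squared error with an optimizer step realizes [14]-[15];
    # line [16] corresponds to target_net.load_state_dict(online_net.state_dict())
    return ((q_sa - td_target) ** 2).mean()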

Double DQN Pseudocode

Double DQN mitigates the overestimation problem that affects both DQN and Q-Learning.

[Figure: Double DQN pseudocode]
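
The only change relative to DQN is how the bootstrap target is built: the online network selects the next action and the target network evaluates it. A minimal sketch of the target computation (tensor and network names are illustrative):

import torch

def double_dqn_target(online_net, target_net, rewards, next_states, dones, gamma=0.99):
    with torch.no_grad():
        # a* = argmax_a Q(s', a; theta): action selection by the online network
        best_actions = online_net(next_states).argmax(dim=1, keepdim=True)
        # Q(s', a*; theta^-): action evaluation by the target network
        next_q = target_net(next_states).gather(1, best_actions).squeeze(1)
        return rewards + gamma * (1.0 - dones.float()) * next_q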

Dueling DQN Network

Dueling DQN improves DQN at the level of the network architecture by decomposing the action-value function as

$$Q^{\pi}(s,a) = V^{\pi}(s) + A^{\pi}(s,a)$$

[Figure: Dueling DQN network architecture]
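
A minimal dueling head in PyTorch; subtracting the mean advantage is the usual trick that makes the V/A decomposition identifiable. This is an illustrative sketch, not necessarily the exact architecture in the figure:

import torch.nn as nn

class DuelingDQN(nn.Module):
    def __init__(self, num_states, num_actions, num_hidden=64):
        super().__init__()
        self.feature = nn.Sequential(nn.Linear(num_states, num_hidden), nn.ReLU())
        self.value = nn.Linear(num_hidden, 1)                 # V(s)
        self.advantage = nn.Linear(num_hidden, num_actions)   # A(s, a)

    def forward(self, x):
        h = self.feature(x)
        v = self.value(h)
        a = self.advantage(h)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return v + a - a.mean(dim=1, keepdim=True)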

Code

The required imports:

import numpy as np
import random                       # used by the replay buffer for sampling
from collections import deque       # used by the replay buffer for storage
import torch
# torch modules
import torch.nn as nn
from torch.nn import functional as F
import gym
import matplotlib.pyplot as plt     # used for the reward/loss plots at the end
# the environment
env = gym.make("MountainCar-v0")
env = env.unwrapped
state, _ = env.reset()
print("Initial state: {}".format(state))

The experience replay buffer class:

# Experience replay buffer
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.Buffer = deque(maxlen=capacity)
    # Append a new transition to the buffer
    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.Buffer.append((state, action, reward, next_state, done))
    # Draw a random batch of batchsize transitions from the buffer
    def sample(self, batchsize):
        state_array = []
        action_array = []
        reward_array = []
        next_state_array = []
        done_array = []
        # sample uniformly at random, without replacement
        batchsize_buffer = random.sample(self.Buffer, batchsize)
        for state, action, reward, next_state, done in batchsize_buffer:
            state_array.append(state)
            action_array.append(action)
            reward_array.append(reward)
            next_state_array.append(next_state)
            done_array.append(done)
        return state_array, action_array, reward_array, next_state_array, done_array
    # Current number of stored transitions
    def __len__(self):
        return len(self.Buffer)
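
A quick check of how the buffer is meant to be used; the numbers below are arbitrary illustration values:

buffer = ReplayBuffer(capacity=5)
for i in range(3):
    buffer.push(np.array([0.0, 0.0]), 0, -1.0, np.array([0.1, 0.0]), False)
print(len(buffer))                                            # 3 transitions stored
states, actions, rewards, next_states, dones = buffer.sample(2)
print(len(states), actions, rewards)                          # 2 sampled transitions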

Building the DQN network:

# The DQN network
class DQN(nn.Module):
    def __init__(self, num_states, num_actions, num_hidden=64, learning_rate=0.01):
        super(DQN, self).__init__()
        self.num_states = num_states
        self.num_actions = num_actions
        # network architecture: two hidden layers with ReLU activations
        self.layers = nn.Sequential(
            nn.Linear(num_states, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, num_actions)
        )
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
    def forward(self, x):
        # x is a torch tensor of states
        return self.layers(x)
    # Epsilon-greedy action selection based on the current network
    def select_action(self, state, epsilon=0.1):
        # state is a numpy array; the return value is an int
        if np.random.random() <= epsilon:
            return np.random.randint(0, int(self.num_actions))
        else:
            q_value = self.forward(torch.tensor(state, dtype=torch.float))
            action = torch.argmax(q_value).item()
            return action
    # One gradient step on a batch sampled from the replay buffer
    def update_policy(self, replay_buffer, batchsize, gamma=0.9):
        # draw batchsize transitions from the replay buffer
        state_array, action_array, reward_array, next_state_array, done_array = replay_buffer.sample(batchsize)
        # convert the sampled transitions to torch tensors
        state_array = torch.tensor(np.array(state_array), dtype=torch.float)
        action_array = torch.tensor(action_array, dtype=torch.long)
        reward_array = torch.tensor(reward_array, dtype=torch.float)
        next_state_array = torch.tensor(np.array(next_state_array), dtype=torch.float)
        done_array = torch.tensor(done_array)
        # accumulate the squared TD errors
        losses = []
        for t in range(len(state_array)):
            s = state_array[t]
            a = action_array[t]
            r = reward_array[t]
            s_ = next_state_array[t]
            q = self.forward(s)[0][a]
            done = done_array[t]
            if done:
                q_target = r
            else:
                # bootstrapped target; this version has no separate target network,
                # so the target is detached to stop gradients flowing through it
                q_target = r + gamma * torch.max(self.forward(s_)[0]).detach()
            losses.append((q - q_target).pow(2))
        # stack the per-sample losses so gradients flow back into the network parameters
        loss = torch.stack(losses).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

The ε (exploration rate) decay schedule:

# Gradually reduce the exploration rate epsilon as training progresses
def epsilon_set(epsiode, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=100):
    # epsiode: index of the current episode
    return epsilon_final + (epsilon_start - epsilon_final)*np.exp(-epsiode/epsilon_decay)
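
A quick sanity check of the schedule: with the defaults, epsilon starts at 1.0 and decays toward epsilon_final = 0.01 as the episode index grows. The snippet below simply prints a few sample points:

# print a few sample points of the exploration schedule
for e in (0, 50, 100, 200, 500):
    print("episode {:>3d} -> epsilon {:.3f}".format(e, epsilon_set(e)))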

Instantiating the environment and the classes:

# Create the environment env, the DQN network, and the replay buffer
env = gym.make("MountainCar-v0")
env = env.unwrapped
DQN_network = DQN(num_states=2, num_actions=3)
replay_buffer = ReplayBuffer(capacity=10000)

The main training function:

# Main training loop
def main(epsiodes=50, batchsize=100, global_step=30, gamma=0.9):
    # global_step: perform one parameter update every global_step environment steps
    reward_array = []
    mean_reward_array = []
    losses = []
    global_count = 0
    for epsiode in range(epsiodes):
        state, _ = env.reset()
        epsiode_reward = 0.0
        while True:
            # compute the current exploration rate
            epsilon = epsilon_set(epsiode)
            # sample an action a
            action = DQN_network.select_action(state, epsilon=epsilon)
            # take a step to obtain (r, s_, done)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            # store the transition (s, a, r, s_, done) in the replay buffer
            replay_buffer.push(state, action, reward, next_state, done)
            # advance the state: s <- s_
            state = next_state
            epsiode_reward += reward
            global_count += 1
            # sample from the replay buffer and update the parameters
            if (len(replay_buffer) >= batchsize) and (global_count % global_step == 0):
                loss = DQN_network.update_policy(replay_buffer, batchsize, gamma=gamma)
                losses.append(loss)
            # stop once the episode has ended
            if done:
                break
        reward_array.append(epsiode_reward)
        mean_reward_array.append(np.mean(reward_array))
        print("epsiode:{},rewards:{},mean_rewards:{}".format(epsiode, reward_array[-1], int(mean_reward_array[-1])))
    return reward_array, mean_reward_array, losses

Running it:

# Run without a target network
reward_array, mean_reward_array, losses = main(epsiodes=50, batchsize=50)
plt.plot(reward_array)
plt.plot(mean_reward_array)
plt.plot(losses)
plt.show()

The result:

epsiode:0,rewards:-44051.0,mean_rewards:-44051
epsiode:1,rewards:-19320.0,mean_rewards:-31685
epsiode:2,rewards:-9235.0,mean_rewards:-24202
epsiode:3,rewards:-52916.0,mean_rewards:-31380
epsiode:4,rewards:-142999.0,mean_rewards:-53704
epsiode:5,rewards:-102141.0,mean_rewards:-61777
epsiode:6,rewards:-107398.0,mean_rewards:-68294
epsiode:7,rewards:-2794.0,mean_rewards:-60106
epsiode:8,rewards:-161268.0,mean_rewards:-71346
epsiode:9,rewards:-1394702.0,mean_rewards:-203682
epsiode:10,rewards:-432946.0,mean_rewards:-224524
epsiode:11,rewards:-16071.0,mean_rewards:-207153
epsiode:12,rewards:-1041327.0,mean_rewards:-271320
epsiode:13,rewards:-583157.0,mean_rewards:-293594
epsiode:14,rewards:-406127.0,mean_rewards:-301096
epsiode:15,rewards:-1726005.0,mean_rewards:-390153
...

In short, the reward curve oscillates so badly that it is hard to keep watching…

The Modified DQN Code (with a Target Network)

import torch                                    # torch
import torch.nn as nn                           # torch.nn
import torch.nn.functional as F                 # torch.nn.functional
import numpy as np                              # numpy
import gym                                      # gym
import matplotlib.pyplot as plt                 # matplotlib, used to plot the mean reward curve

# Hyperparameters
BATCH_SIZE = 500                                # batch size
LR = 0.01                                       # learning rate
EPSILON = 0.9                                   # epsilon for the epsilon-greedy policy
GAMMA = 0.9                                     # reward discount factor
TARGET_REPLACE_ITER = 100                       # how often (in learning steps) the target network is refreshed
MEMORY_CAPACITY = 20000                         # replay memory capacity
env = gym.make('MountainCar-v0').unwrapped      # the MountainCar-v0 environment from gym, unwrapped (no time limit)
N_ACTIONS = env.action_space.n                  # number of actions (3 for MountainCar)
N_STATES = env.observation_space.shape[0]       # state dimension (2 for MountainCar)
EPSIODES = 500                                  # number of training episodes

# The Net class (the Q-network)
class Net(nn.Module):
    def __init__(self):                                                         # define the layers of Net
        # a subclass of nn.Module must call the parent constructor
        super(Net, self).__init__()                                             # equivalent to nn.Module.__init__()

        self.fc1 = nn.Linear(N_STATES, 60)                                      # first fully connected layer: N_STATES inputs to 60 hidden units
        self.fc1.weight.data.normal_(0, 0.1)                                    # weight initialization (normal with mean 0, std 0.1)
        self.out = nn.Linear(60, N_ACTIONS)                                     # second fully connected layer: 60 hidden units to N_ACTIONS outputs
        self.out.weight.data.normal_(0, 0.1)                                    # weight initialization (normal with mean 0, std 0.1)

    def forward(self, x):                                                       # forward pass (x is a batch of states)
        x = F.relu(self.fc1(x))                                                 # hidden layer with ReLU activation
        actions_value = self.out(x)                                             # output layer: one Q-value per action
        return actions_value                                                    # return the action values


# The DQN agent (holds the two networks)
class DQN(object):
    def __init__(self):                                                         # define the DQN agent's attributes
        self.eval_net, self.target_net = Net(), Net()                           # build two networks from Net: the evaluation network and the target network
        self.learn_step_counter = 0                                             # counts learning steps, used for target updating
        self.memory_counter = 0                                                 # counts stored transitions
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))             # replay memory; each row holds one transition (s, a, r, s_)
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)    # Adam optimizer over the evaluation network's parameters
        self.loss_func = nn.MSELoss()                                           # mean squared error loss: loss(xi, yi) = (xi - yi)^2

    def choose_action(self, x):                                                 # action selection (x is a state)
        x = torch.unsqueeze(torch.FloatTensor(x), 0)                            # convert x to a float tensor and add a batch dimension at dim=0
        if np.random.uniform() < EPSILON:                                       # with probability EPSILON, act greedily
            actions_value = self.eval_net.forward(x)                            # forward the state through the evaluation network to get its action values
            action = torch.max(actions_value, 1)[1].data.numpy()                # index of the row-wise maximum, as a numpy array
            action = action[0]                                                  # take the single entry
        else:                                                                   # otherwise explore
            action = np.random.randint(0, N_ACTIONS)                            # uniform over {0, ..., N_ACTIONS - 1} (N_ACTIONS = 3 for MountainCar)
        return action                                                           # return the chosen action

    def store_transition(self, s, a, r, s_):                                    # store one transition in the replay memory
        transition = np.hstack((s, [a, r], s_))                                 # concatenate the transition into one row
        # once the memory is full, overwrite the oldest entries
        index = self.memory_counter % MEMORY_CAPACITY                           # row this transition is written into
        self.memory[index, :] = transition                                      # write the transition
        self.memory_counter += 1                                                # increment the transition counter

    def learn(self):                                                            # one learning step (called once the memory is full)
        # target network parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:                  # triggered on the first call and then every TARGET_REPLACE_ITER steps
            self.target_net.load_state_dict(self.eval_net.state_dict())         # copy the evaluation network's parameters into the target network
        self.learn_step_counter += 1                                            # increment the learning step counter

        # sample a batch from the replay memory
        sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)            # draw BATCH_SIZE indices from [0, MEMORY_CAPACITY), possibly with repeats
        b_memory = self.memory[sample_index, :]                                 # the BATCH_SIZE sampled transitions
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])
        # the sampled states s as a float tensor of shape (BATCH_SIZE, N_STATES)
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int))
        # the sampled actions a as a LongTensor of shape (BATCH_SIZE, 1) (LongTensor is what torch.gather expects below)
        b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2])
        # the sampled rewards r as a float tensor of shape (BATCH_SIZE, 1)
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])
        # the sampled next states s_ as a float tensor of shape (BATCH_SIZE, N_STATES)

        # compute the evaluation and target values for the batch, then update the evaluation network
        q_eval = self.eval_net(b_s).gather(1, b_a)
        # eval_net(b_s) gives the action values for every state in the batch; .gather(1, b_a) picks out the value of the action actually taken in each row
        q_next = self.target_net(b_s_).detach()
        # q_next comes from the target network and is detached so no gradients flow through it
        q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)
        # q_next.max(1)[0] is the per-row maximum (a 1-D tensor of length BATCH_SIZE); .view() reshapes it to (BATCH_SIZE, 1); adding b_r gives the TD target
        loss = self.loss_func(q_eval, q_target)
        # mean squared error between the BATCH_SIZE evaluation values and the BATCH_SIZE target values
        self.optimizer.zero_grad()                                      # clear gradients left over from the previous step
        loss.backward()                                                 # backpropagate the error to compute parameter gradients
        self.optimizer.step()                                           # update all parameters of the evaluation network
    # training loop
    def train_network(self):
        reward_array = []
        mean_array = []
        for epsiode in range(EPSIODES):                                         # loop over EPSIODES episodes
            s, _ = env.reset()                                                  # reset the environment
            episode_reward_sum = 0                                              # total reward accumulated in this episode
            learn_flag = 0
            while True:                                                         # one episode (each iteration is one step)
                a = self.choose_action(s)                                       # choose an action given the current state s
                s_, r, terminated, truncated, _ = env.step(a)                   # take the action and observe the outcome
                done = terminated or truncated
                self.store_transition(s, a, r, s_)                              # store the transition
                episode_reward_sum += r                                         # accumulate the per-step reward
                s = s_                                                          # advance the state
                if self.memory_counter > MEMORY_CAPACITY:                       # start learning once the replay memory is full
                    # each call samples BATCH_SIZE transitions and updates the evaluation network,
                    # copying its parameters into the target network every TARGET_REPLACE_ITER learning steps
                    learn_flag += 1
                    if learn_flag == 1:
                        print("-------- learning started in epsiode={} --------".format(epsiode))
                    self.learn()
                if done:                                                        # episode finished
                    break
            if learn_flag >= 1:
                learn_flag = 0
                print("-------- learning finished in epsiode={} --------".format(epsiode))
            reward_array.append(episode_reward_sum)
            mean_array.append(np.mean(reward_array))
            print("epsiode:{}, reward: {:.2f}, mean reward: {:.2f}".format(epsiode, reward_array[-1], mean_array[-1]))
        plt.plot(mean_array)
        plt.show()

# main entry point
if __name__ == "__main__":
    dqn = DQN()
    dqn.train_network()

The results:

[Figure: training results for the modified DQN]
