Implementing the REINFORCE Policy Gradient Algorithm (Reinforcement Learning) in PyTorch

1. Formula Derivation

The derivation here follows the advanced-model part of Chapter 3 of Qiu Xipeng's book Neural Networks and Deep Learning (link: 《神经网络与深度学习》).
The policy gradient is estimated from sampled trajectories (Monte Carlo sampling) rather than computed exactly. For each sampled episode, the return G_t at every time step is computed from the rewards collected after that step, and the policy parameters are then updated by stochastic gradient descent, one time step at a time, following the REINFORCE pseudocode. The key quantities are sketched below.
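In standard REINFORCE notation (a sketch of the formulas the derivation refers to; the superscript (n) indexes the N sampled trajectories), the sampled policy gradient, the return G_t, and the per-step stochastic-gradient update can be written as

$$\nabla_\theta J(\theta) \approx \frac{1}{N}\sum_{n=1}^{N}\sum_{t=0}^{T_n-1}\gamma^{t}\,G_{t}^{(n)}\,\nabla_\theta \log \pi_\theta\!\left(a_{t}^{(n)}\mid s_{t}^{(n)}\right)$$

$$G_{t} = \sum_{k=t}^{T-1}\gamma^{\,k-t}\,r_{k+1} = r_{t+1} + \gamma\,G_{t+1}$$

$$\theta \leftarrow \theta + \alpha\,\gamma^{t}\,G_{t}\,\nabla_\theta \log \pi_\theta\!\left(a_{t}\mid s_{t}\right)$$

These three expressions correspond to what calc_reward_to_go (the γ^t-weighted G values) and Agent.learn (one gradient step per visited time step) implement in the code below.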

2. Core Code

def main():

    env = gym.make('CartPole-v0')
    obs_n = env.observation_space.shape[0]
    act_n = env.action_space.n
    logger.info('obs_n {},act_n {}'.format(obs_n, act_n))

    model = Pgnet(obs_n, act_n)
    agent = Agent(net=model, obs_n=obs_n, act_n=act_n, lr=0.01, gamma=1.0)
    R = []
    Episode = []
    for j in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        obstotensor = torch.FloatTensor(batch_obs).view(len(batch_reward), -1)
        actiontotensor = torch.LongTensor(batch_action).view(len(batch_reward), )
        rewardtotensor = torch.FloatTensor(batch_reward).view(len(batch_reward), -1)
        for i in range(len(batch_reward)):
            obs = obstotensor[i, :]
            act = actiontotensor[i]
            reward = rewardtotensor[i]

            agent.learn(obs, act, reward)
        # if (i+1)%100 == 0:
        total_reward = evaluate(env, agent, render=True)
        print('episode%s---test_reward: %s' % (j, round(total_reward, 2)))
        R.append(total_reward)
        Episode.append(j)
    env.close()
    # Save the network parameters once training is done
    torch.save(model.state_dict(), 'network_params.pth')
    fig, ax = plt.subplots()
    ax.plot(Episode, R, linewidth=3)
    ax.set_xlabel('episode')
    ax.set_ylabel('testreward')
    plt.show()
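Once training has finished, the saved weights can be reloaded and evaluated without retraining. A minimal sketch (evaluate_saved is a hypothetical helper; it assumes the Pgnet, Agent and evaluate definitions below and the network_params.pth file written above):

def evaluate_saved(path='network_params.pth'):
    env = gym.make('CartPole-v0')
    obs_n = env.observation_space.shape[0]
    act_n = env.action_space.n
    model = Pgnet(obs_n, act_n)
    model.load_state_dict(torch.load(path))    # restore the trained parameters
    agent = Agent(net=model, obs_n=obs_n, act_n=act_n, lr=0.01, gamma=1.0)
    total_reward = evaluate(env, agent, render=True)
    env.close()
    return total_reward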

3. Full Code

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging
import gym
import matplotlib.pyplot as plt

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Convert the collected per-step rewards into G values (returns)

def calc_reward_to_go(reward_list, gamma=0.9):
    # Return the G value (reward-to-go) at each time step t
    for i in range(len(reward_list) - 2, -1, -1):
        # walk backwards through the episode: G_t = r_t + gamma * G_{t+1}
        reward_list[i] += gamma * reward_list[i + 1]
    for i in range(len(reward_list)):
        # weight each G_t by gamma**t, matching the gamma^t factor in the policy-gradient formula
        reward_list[i] = gamma ** i * reward_list[i]
    return np.array(reward_list)
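# Quick sanity check (hypothetical example, not called by the script): with
# reward_list = [1.0, 1.0, 1.0] and gamma = 0.9, the rewards-to-go are
# [1 + 0.9*1.9, 1 + 0.9*1.0, 1.0] = [2.71, 1.9, 1.0],
# and after the gamma**t weighting the function returns [2.71, 1.71, 0.81].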

# Define the policy network

class Pgnet(nn.Module):
    # This network maps an observation to unnormalized action scores (logits)
    def __init__(self, obs_n, act_n):
        super(Pgnet, self).__init__()
        self.linear1 = nn.Linear(obs_n, 24)
        self.linear2 = nn.Linear(24, 36)
        self.linear3 = nn.Linear(36, act_n)

    def forward(self, obs):
        x = torch.tanh(self.linear1(obs))
        x = torch.tanh(self.linear2(x))
        output = self.linear3(x)
        return output
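# Quick shape check (hypothetical example, not called by the script): for CartPole,
#   Pgnet(4, 2)(torch.randn(1, 4))
# returns a tensor of shape (1, 2) holding unnormalized scores; the softmax is applied
# later, inside Agent.sample and inside the cross-entropy loss.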

# Define the agent

class Agent(object):
    def __init__(self, net, obs_n, act_n, lr, gamma):
        self.model = net
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=lr)
        self.loss_function = nn.CrossEntropyLoss()
        self.obs_n = obs_n
        self.act_n = act_n
        self.gamma = gamma

    def sample(self, obs):
        # Sample an action stochastically while interacting with the environment
        obs = torch.unsqueeze(torch.FloatTensor(obs), 0)    # convert to a tensor and add a batch dimension
        actions_prob = F.softmax(self.model(obs), dim=1)
        actions_prob = torch.squeeze(actions_prob, 0).data.numpy()
        act = np.random.choice(range(self.act_n), p=actions_prob)
        return act

    def predict(self, obs):
        # Pick the highest-probability action at evaluation time
        obs = torch.unsqueeze(torch.FloatTensor(obs), 0)    # convert to a tensor and add a batch dimension
        actions_prob = self.model(obs)
        action = torch.max(actions_prob, 1)[1].data.numpy()    # [1] selects the argmax indices along dim 1, as a numpy array
        action = action[0]
        return action

    def learn(self, obs, action, reward):
        obs = torch.unsqueeze(obs, 0)
        action = torch.unsqueeze(action, 0)
        self.optimizer.zero_grad()
        act_prob = self.model(obs)
        # nn.CrossEntropyLoss on the logits equals -log pi(a|s), so the REINFORCE
        # loss to minimize is reward * (-log pi(a|s))
        neg_log_prob = self.loss_function(act_prob, action)
        loss = reward * neg_log_prob
        loss.backward()
        self.optimizer.step()
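# A quick check of the identity used in learn() (hypothetical snippet, not called by the script):
#   logits = torch.tensor([[1.0, 2.0]])
#   a = torch.tensor([1])
#   nn.CrossEntropyLoss()(logits, a)  vs  -torch.log_softmax(logits, dim=1)[0, 1]
# Both sides evaluate to about 0.3133, i.e. -log pi(a=1 | s); minimizing reward * (-log pi)
# therefore performs gradient ascent on the expected return.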

# Function that collects one episode of data
def run_episode(env, agent):
    # Roll out a single episode and record the observations, actions and rewards
    obs_list, action_list, reward_list = [], [], []
    s = env.reset()
    #env.render()
    while True:
        a = agent.sample(s)  # choose an action given the current state s
        s_, r, done, info = env.step(a)  # take the action and observe the environment's feedback

        # Reshape the reward (optional; the shaped reward just speeds up training of the balancing policy)
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        new_r = r1 + r2
        obs_list.append(s)
        action_list.append(a)
        reward_list.append(new_r)
        s = s_
        if done:
            break
    return obs_list,action_list,reward_list
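# Note on the shaped reward (values assume the standard CartPole-v0 thresholds,
# x_threshold = 2.4 and theta_threshold_radians ~= 0.21 rad):
#   r1 = (x_threshold - |x|) / x_threshold - 0.8              lies in (-0.8, 0.2]
#   r2 = (theta_threshold - |theta|) / theta_threshold - 0.5  lies in (-0.5, 0.5]
# so new_r is largest when the cart is centred and the pole is upright, and becomes
# negative as either limit is approached.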

# Evaluate the learned policy network
def evaluate(env, agent, render=False):
    # Run 5 greedy episodes and return the mean episode reward
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


def main():

    env = gym.make('CartPole-v0')
    obs_n = env.observation_space.shape[0]
    act_n = env.action_space.n
    logger.info('obs_n {},act_n {}'.format(obs_n, act_n))

    model = Pgnet(obs_n, act_n)
    agent = Agent(net=model, obs_n=obs_n, act_n=act_n, lr=0.01, gamma=1.0)
    R = []
    Episode = []
    for j in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        obstotensor = torch.FloatTensor(batch_obs).view(len(batch_reward), -1)
        actiontotensor = torch.LongTensor(batch_action).view(len(batch_reward), )
        rewardtotensor = torch.FloatTensor(batch_reward).view(len(batch_reward), -1)
        for i in range(len(batch_reward)):
            obs = obstotensor[i, :]
            act = actiontotensor[i]
            reward = rewardtotensor[i]

            agent.learn(obs, act, reward)
        # if (i+1)%100 == 0:
        total_reward = evaluate(env, agent, render=True)
        print('episode%s---test_reward: %s' % (j, round(total_reward, 2)))
        R.append(total_reward)
        Episode.append(j)
    env.close()
    # Save the network parameters once training is done
    torch.save(model.state_dict(), 'network_params.pth')
    fig, ax = plt.subplots()
    ax.plot(Episode, R, linewidth=3)
    ax.set_xlabel('episode')
    ax.set_ylabel('testreward')
    plt.show()

if __name__ == "__main__":
    main()

4. Training Results

The figure below shows one training run. Policy-gradient (REINFORCE) training is quite unstable: sometimes the learned policy performs very well, sometimes only moderately, so it is worth running the training several times.
(Figures: training results for one run.)
