Training Pendulum with the Deep Reinforcement Learning Algorithm PPO

This post presents a PPO-based deep reinforcement learning implementation, covering the network architecture, data handling, training loop, and environment integration, and shows how to apply the algorithm to the Pendulum-v1 environment.

PPO code walkthrough: training with continuous actions.

1.  Import the required packages

import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.distributions import Normal
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
import gym
import copy

2.  Define the actor and critic networks

class actor_net(nn.Module):

    def __init__(self, state_n, action_n, hidden_n):
        super(actor_net, self).__init__()
        self.fc1 = nn.Linear(state_n, hidden_n)
        self.fc2 = nn.Linear(hidden_n, hidden_n)

        self.mu = nn.Linear(hidden_n, action_n)
        self.sigma = nn.Linear(hidden_n, action_n)

    def forward(self, x):
        x = f.relu(self.fc1(x))
        x = f.relu(self.fc2(x))
        # scale the tanh output to Pendulum's action range [-2, 2]
        mu = torch.tanh(self.mu(x)) * 2
        # note: with a 1-dimensional action, softmax over dim=1 always returns 1,
        # so sigma stays fixed near 1.001; softplus is a common alternative here
        sigma = f.softmax(self.sigma(x), dim=1) + 0.001
        return mu, sigma


class critic_net(nn.Module):

    def __init__(self, state_n, hidden_n):
        super(critic_net, self).__init__()
        self.fc1 = nn.Linear(state_n, hidden_n)
        self.fc2 = nn.Linear(hidden_n, hidden_n)
        self.fc3 = nn.Linear(hidden_n, 1)

    def forward(self, x):
        x = f.relu(self.fc1(x))
        x = f.relu(self.fc2(x))
        value = self.fc3(x)
        return value
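As a quick sanity check of the two networks above, here is a minimal sketch; the dimensions assume Pendulum-v1's 3-dimensional observation and 1-dimensional action, and hidden_n = 256 is simply the value used later in the config:

# hypothetical smoke test for actor_net and critic_net
dummy_state = torch.zeros(1, 3)             # a batch of one Pendulum observation
actor = actor_net(state_n=3, action_n=1, hidden_n=256)
critic = critic_net(state_n=3, hidden_n=256)
mu, sigma = actor(dummy_state)              # mu in [-2, 2], sigma > 0
value = critic(dummy_state)                 # scalar state value
print(mu.shape, sigma.shape, value.shape)   # torch.Size([1, 1]) for all three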

3.  Define the data buffer, including pushing data, sampling data, and clearing data

class buffer(object):
    def __init__(self, length):
        self.buffer_length = length
        self.buffer = deque(maxlen = self.buffer_length)
    def push(self, trans):
        self.buffer.append(trans)
    def sample(self):
        batch = list(self.buffer)
        return zip(*batch)
    def clear(self):
        self.buffer.clear()
    def length(self):
        return len(self.buffer)
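The buffer simply stores transition tuples in arrival order, and sample() returns them grouped by field. A minimal usage sketch (the transition layout mirrors what the training loop pushes later):

# hypothetical example: push two transitions and unpack them by field
memory = buffer(length=128)
memory.push((np.zeros(3), [0.0], 1.0, np.ones(3), False))
memory.push((np.ones(3), [0.5], 0.5, np.zeros(3), True))
states, actions, rewards, next_states, dones = memory.sample()
print(len(states), dones)   # 2 (False, True)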

4.  Define the config

class config():
    def __init__(self):
        self.env_name = 'Pendulum-v1'
        self.train_eps = 10000
        self.test_eps = 20
        self.max_step = 200
        self.eval_eps = 5
        self.eval_per_ep = 10
        self.gamma = 0.99
        self.actor_lr = 2e-5
        self.critic_lr = 2e-5
        self.buffer_length = 128    # rollout length; also the size that triggers an update
        self.eps_clip = 0.2         # PPO clip range epsilon
        self.lam = 0.95             # GAE lambda
        self.batch_size = 128
        self.update_n = 8           # optimization epochs per PPO update
        self.hidden_n = 256
        self.seed = 1
        self.device = 'cpu'

5.  Define PPO, including sampling actions from states and the PPO update

class PPO():

    def __init__(self, cfg):

        self.cfg = cfg   # keep a reference instead of relying on the module-level cfg
        self.device = torch.device(cfg.device)
        # the "old" networks act as the fixed behavior policy; the "new" networks are the ones being optimized
        self.old_actor = actor_net(cfg.state_n, cfg.action_n, cfg.hidden_n).to(self.device)
        self.old_critic = critic_net(cfg.state_n, cfg.hidden_n).to(self.device)
        self.new_actor = actor_net(cfg.state_n, cfg.action_n, cfg.hidden_n).to(self.device)
        self.new_critic = critic_net(cfg.state_n, cfg.hidden_n).to(self.device)
        # start each pair from the same weights so the first update's ratio is well defined
        self.old_actor.load_state_dict(self.new_actor.state_dict())
        self.old_critic.load_state_dict(self.new_critic.state_dict())

        self.actor_optim = optim.Adam(self.new_actor.parameters(), lr=cfg.actor_lr)
        self.critic_optim = optim.Adam(self.new_critic.parameters(), lr=cfg.critic_lr)
        self.memory = buffer(cfg.buffer_length)


    def sample_action(self, state):

        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float, device=self.device).unsqueeze(dim=0)
            mu, sigma = self.old_actor(state)

            # sample from the behavior (old) policy's Gaussian
            dist = Normal(mu, sigma)
            action = dist.sample()

        return [action.item()]


    def update(self):
        # only update once a full rollout has been collected
        if self.memory.length() < self.cfg.batch_size:
            return
        states, actions, rewards, states_, dones = self.memory.sample()
        states = torch.tensor(np.array(states), dtype=torch.float, device=self.device)
        actions = torch.tensor(np.array(actions), dtype=torch.float, device=self.device)
        rewards = torch.tensor(np.array(rewards), dtype=torch.float, device=self.device).reshape(-1, 1)
        states_ = torch.tensor(np.array(states_), dtype=torch.float, device=self.device)
        dones = torch.tensor(np.array(dones), device=self.device).reshape(-1, 1)

        for _ in range(self.cfg.update_n):

            # TD target from the old critic; detached in the loss so it acts as a fixed regression target
            td_target = rewards + self.cfg.gamma * self.old_critic(states_) * ~dones
            # log-probabilities under the old (behavior) policy; detach so gradients
            # only flow through the new actor
            mu, sigma = self.old_actor(states)
            old_dis = Normal(mu, sigma)
            old_log_prob = old_dis.log_prob(actions).detach()

            # Generalized Advantage Estimation (GAE) from the new critic's TD errors
            td_error = rewards + self.cfg.gamma * self.new_critic(states_) * ~dones - self.new_critic(states)
            td_error = td_error.detach().cpu().numpy()
            advantage = []
            adv = 0
            for td in td_error[::-1]:
                adv = adv * self.cfg.gamma * self.cfg.lam + td[0]
                advantage.append(adv)
            advantage.reverse()

            advantage = torch.tensor(advantage, dtype=torch.float, device=self.device).reshape(-1, 1)

            # clipped surrogate objective (PPO-Clip)
            new_mu, new_sigma = self.new_actor(states)
            new_dist = Normal(new_mu, new_sigma)
            new_log_prob = new_dist.log_prob(actions)
            ratio = torch.exp(new_log_prob - old_log_prob)

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - self.cfg.eps_clip, 1 + self.cfg.eps_clip) * advantage
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = f.mse_loss(self.new_critic(states), td_target.detach())

            self.actor_optim.zero_grad()
            self.critic_optim.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optim.step()
            self.critic_optim.step()
        # sync the behavior networks with the freshly updated ones and start a new rollout
        self.old_critic.load_state_dict(self.new_critic.state_dict())
        self.old_actor.load_state_dict(self.new_actor.state_dict())
        self.memory.clear()
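For reference, the loop above optimizes the standard PPO-Clip surrogate with advantages estimated by GAE; this is the textbook form of what the code computes, not extra functionality:

$$L^{\mathrm{CLIP}}(\theta)=\mathbb{E}_t\Big[\min\big(r_t(\theta)\hat A_t,\ \mathrm{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat A_t\big)\Big],\qquad r_t(\theta)=\frac{\pi_\theta(a_t\mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t\mid s_t)}$$

$$\hat A_t=\delta_t+\gamma\lambda\,\hat A_{t+1},\qquad \delta_t=r_t+\gamma V(s_{t+1})(1-d_t)-V(s_t)$$

Here ε is eps_clip, γ is gamma, and λ is lam from the config, and the critic is trained by MSE regression toward the TD target computed with the old critic.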

6.  Define the environment and agent

def get_env_agent(cfg):
    env = gym.make(cfg.env_name)
    state_n = env.observation_space.shape[0]
    action_n = env.action_space.shape[0]
    print('state space dimension:', state_n)
    print('action space dimension:', action_n)
    setattr(cfg, 'state_n', state_n)
    setattr(cfg, 'action_n', action_n)
    agent = PPO(cfg)
    return env, agent
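Note that this script targets the pre-0.26 gym API, where env.reset(seed=...) returns only the observation and env.step returns four values. If you are on gym >= 0.26 or gymnasium, a small adapter along these lines (a hypothetical sketch, not part of the original code) keeps the rest of the script unchanged:

# hypothetical wrapper for the newer gym / gymnasium API (>= 0.26),
# where reset() returns (obs, info) and step() returns 5 values
class OldAPIWrapper:
    def __init__(self, env):
        self.env = env
        self.observation_space = env.observation_space
        self.action_space = env.action_space

    def reset(self, seed=None):
        obs, _info = self.env.reset(seed=seed)
        return obs

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        return obs, reward, terminated or truncated, info

    def close(self):
        self.env.close()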

7.  Define training

def train(cfg, env, agent):
    print('train')
    rewards = []
    steps = []
    best_ep_reward = -10000
    output_agent = None
    for ep_i in range(cfg.train_eps):
        ep_reward = 0
        ep_step = 0
        state = env.reset(seed = cfg.seed)   # note: reusing the same seed makes every episode start from the same state
        for _ in range(cfg.max_step):
            ep_step += 1
            action = agent.sample_action(state)
            next_state, reward, done, _ = env.step(action)
            # rescale Pendulum's reward (roughly [-16.2, 0]) to roughly [-1, 1] before storing it
            agent.memory.push((state, action, (reward + 8.1) / 8.1, next_state, done))
            state = next_state
            agent.update()
            ep_reward += reward
            if done:
                break
        if (ep_i + 1) % cfg.eval_per_ep == 0:
            sum_eval_reward = 0
            for _ in range(cfg.eval_eps):
                eval_ep_reward = 0
                state = env.reset()
                for _ in range(cfg.max_step):
                    action = agent.sample_action(state)
                    next_state, reward, done, _ = env.step(action)
                    state = next_state
                    eval_ep_reward += reward
                    if done:
                        break
                sum_eval_reward += eval_ep_reward
            mean_eval_reward = sum_eval_reward / cfg.eval_eps
            if mean_eval_reward > best_ep_reward:
                best_ep_reward = mean_eval_reward
                output_agent = copy.deepcopy(agent)
                print('train ep_i:%d/%d, rewards:%f, mean_eval_reward:%f, best_ep_reward:%f, update model'%(ep_i + 1, cfg.train_eps, ep_reward, mean_eval_reward, best_ep_reward))
            else:
                print('train ep_i:%d/%d, rewards:%f, mean_eval_reward:%f, best_ep_reward:%f'%(ep_i + 1, cfg.train_eps, ep_reward, mean_eval_reward, best_ep_reward))
        steps.append(ep_step)
        rewards.append(ep_reward)
    env.close()
    return output_agent, rewards

8.  Define testing

def test(cfg, env, agent):
    print('test')
    rewards = []
    steps = []
    for ep_i in range(cfg.test_eps):
        ep_reward = 0
        ep_step = 0
        state = env.reset()
        for _ in range(cfg.max_step):
            ep_step += 1
            action = agent.sample_action(state)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            ep_reward += reward
            if done:
                break
        steps.append(ep_step)
        rewards.append(ep_reward)
        print('test ep_i:%d, reward:%f'%(ep_i + 1, ep_reward))
    env.close()
    return rewards

9.  Define the smoothing function for plotting

def smooth(data, weight = 0.9):
    last = data[0]
    smoothed = []
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed
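smooth computes an exponential moving average of the reward curve. A quick worked example (weight=0.5 chosen just for readability):

# each output pulls the running value toward the new point by (1 - weight)
print(smooth([0, 10, 10, 10], weight=0.5))   # [0.0, 5.0, 7.5, 8.75]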

10.  The main function

if __name__ == '__main__':
    cfg = config()
    env, agent = get_env_agent(cfg)
    better_agent, train_rewards = train(cfg, env, agent)
    plt.figure()
    plt.title('training rewards')
    plt.plot(train_rewards, label='train_rewards')
    plt.plot(smooth(train_rewards), label='train_smooth_rewards')
    plt.legend()

    test_rewards = test(cfg, env, better_agent)
    plt.figure()
    plt.title('testing rewards')
    plt.plot(test_rewards, label='test_rewards')
    plt.plot(smooth(test_rewards), label='test_smooth_rewards')
    plt.legend()
    plt.show()
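Since the best agent is only kept in memory, you may also want to persist its weights after training. A minimal sketch using torch.save (the file names are arbitrary, and this is not part of the original script):

# hypothetical: save the best agent's network weights for later reuse
torch.save(better_agent.new_actor.state_dict(), 'ppo_pendulum_actor.pth')
torch.save(better_agent.new_critic.state_dict(), 'ppo_pendulum_critic.pth')

# to reload, rebuild the agent with get_env_agent(cfg), then:
# agent.new_actor.load_state_dict(torch.load('ppo_pendulum_actor.pth'))
# agent.old_actor.load_state_dict(agent.new_actor.state_dict())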
