2021-11-07

# shared_adam.py: an Adam optimizer whose state tensors live in shared memory, so all worker processes update the same moments
import torch


class SharedAdam(torch.optim.Adam):
    # params       -- iterable of parameters to optimize, or dicts defining parameter groups
    # lr           -- learning rate (default: 1e-3)
    # betas        -- coefficients used to compute running averages of the gradient and its square (default here: (0.9, 0.99))
    # eps          -- term added to the denominator to improve numerical stability (default: 1e-8)
    # weight_decay -- weight decay (L2 penalty) (default: 0)
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # State initialization
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0  # note: the step counter is a plain Python int, so it stays per-process rather than shared
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # move the optimizer state into shared memory so every worker updates the same moments
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()
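
The point of SharedAdam is that the first and second moments live in shared memory, so every worker process updates the same statistics. Below is a minimal standalone sketch (not part of shared_adam.py, and assuming the default fork start method on Linux) of what share_memory_() does:

import torch
import torch.multiprocessing as mp

def bump(t):
    t += 1.0  # in-place update inside the child process

if __name__ == "__main__":
    shared = torch.zeros(1)
    shared.share_memory_()  # without this, a forked child would only modify its own copy-on-write copy
    p = mp.Process(target=bump, args=(shared,))
    p.start()
    p.join()
    print(shared)  # tensor([1.]) -- the parent sees the child's in-place update
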
# utils.py: helper functions used by the workers
from torch import nn
import torch
import numpy as np


def v_wrap(np_array, dtype=np.float32):
    # convert a numpy array to a torch tensor, casting to the requested dtype first
    if np_array.dtype != dtype:
        np_array = np_array.astype(dtype)
    return torch.from_numpy(np_array)


def set_init(layers):
    # initialize each layer: weights ~ N(0, 0.1), biases = 0
    for layer in layers:
        nn.init.normal_(layer.weight, mean=0., std=0.1)
        nn.init.constant_(layer.bias, 0.)


# Core A3C update: compute the n-step targets and the local loss, push the worker's
# gradients to the global network, then pull the updated global weights back.
def push_and_pull(opt, lnet, gnet, done, s_, bs, ba, br, gamma):
    if done:
        v_s_ = 0.               # terminal state has zero value
    else:
        v_s_ = lnet.forward(v_wrap(s_[None, :]))[-1].data.numpy()[0, 0]  # bootstrap from the critic's value of the next state
    buffer_v_target = []
    # compute the discounted target for every buffered state and store it in buffer_v_target
    for r in br[::-1]:
        v_s_ = r + gamma * v_s_
        buffer_v_target.append(v_s_)
    buffer_v_target.reverse()  # back to chronological order
    loss = lnet.loss_func(
        v_wrap(np.vstack(bs)),
        v_wrap(np.array(ba), dtype=np.int64) if ba[0].dtype == np.int64 else v_wrap(np.vstack(ba)),
        v_wrap(np.array(buffer_v_target)[:, None]))  # compute the loss from the buffered transitions
    opt.zero_grad()  # clear the worker's gradients
    loss.backward()  # backpropagate through the local network
    # hand the worker's gradients to the global network
    for lp, gp in zip(lnet.parameters(), gnet.parameters()):
        gp._grad = lp.grad
    opt.step()  # one optimization step on the global network
    lnet.load_state_dict(gnet.state_dict())  # pull the updated global parameters back into the worker

# logging helper: bump the shared episode counter, update the running reward and report progress
def record(global_ep, global_ep_r, ep_r, res_queue, name):
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        if global_ep_r.value == 0.:
            global_ep_r.value = ep_r
        else:
            global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01  # exponential moving average of the episode reward
    res_queue.put(global_ep_r.value)
    print(
        name,
        "Ep:", global_ep.value,
        "| Ep_r: %.0f" % global_ep_r.value,
    )
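
To make the target computation in push_and_pull concrete, here is a small worked example (the numbers are assumed, not from the post): with rewards br = [1, 1, 1], gamma = 0.9 and a terminal bootstrap value v_s_ = 0, the backward loop produces the n-step discounted returns that the critic is regressed towards.

v_s_, gamma, br = 0.0, 0.9, [1, 1, 1]
buffer_v_target = []
for r in br[::-1]:          # walk the rewards backwards
    v_s_ = r + gamma * v_s_
    buffer_v_target.append(v_s_)
buffer_v_target.reverse()   # back to chronological order
print(buffer_v_target)      # approximately [2.71, 1.9, 1.0] (up to float rounding)
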
# main training script: A3C on the discrete-action CartPole-v0 task
import torch
import torch.nn as nn
from utils import v_wrap, set_init, push_and_pull, record
import torch.nn.functional as F
import torch.multiprocessing as mp
from shared_adam import SharedAdam
import gym
import os

os.environ["OMP_NUM_THREADS"] = "1"  # disable OpenMP multithreading so each worker process stays single-threaded and the CPU cores are left to the workers

UPDATE_GLOBAL_ITER = 5  # sync with the global network every 5 steps
GAMMA = 0.9  # discount factor
MAX_EP = 3000  # maximum number of episodes

env = gym.make('CartPole-v0')
N_S = env.observation_space.shape[0]  # state dimension
N_A = env.action_space.n  # number of discrete actions


class Net(nn.Module):
    def __init__(self, s_dim, a_dim):
        super(Net, self).__init__()
        self.s_dim = s_dim  # N_S
        self.a_dim = a_dim  # N_A
        self.pi1 = nn.Linear(s_dim, 128)
        self.pi2 = nn.Linear(128, a_dim)
        self.v1 = nn.Linear(s_dim, 128)
        self.v2 = nn.Linear(128, 1)
        set_init([self.pi1, self.pi2, self.v1, self.v2])  # initialize network parameters
        self.distribution = torch.distributions.Categorical  # categorical distribution over discrete actions

    def forward(self, x):
        pi1 = torch.tanh(self.pi1(x))
        logits = self.pi2(pi1)  # policy head: action logits
        v1 = torch.tanh(self.v1(x))
        values = self.v2(v1)  # value head: state value
        return logits, values

    def choose_action(self, s):
        self.eval()
        logits, _ = self.forward(s)
        prob = F.softmax(logits, dim=1).data  # convert logits to action probabilities with softmax
        m = self.distribution(prob)  # build the categorical distribution
        return m.sample().numpy()[0]  # sample an action according to the probabilities

    def loss_func(self, s, a, v_t):
        self.train()
        logits, values = self.forward(s)  # policy logits and state values
        td = v_t - values  # TD error (advantage estimate)
        c_loss = td.pow(2)  # critic loss: squared TD error

        probs = F.softmax(logits, dim=1)  # convert logits to action probabilities with softmax
        m = self.distribution(probs)  # build the categorical distribution
        exp_v = m.log_prob(a) * td.detach().squeeze()  # log-probability of the taken action weighted by the TD error, as in actor-critic; detach() keeps the critic error from flowing back through the actor term
        a_loss = -exp_v  # negate so gradient descent maximizes the expected value
        total_loss = (c_loss + a_loss).mean()  # total network loss
        return total_loss
'''
# continuous-action version of Net (kept commented out; note it uses math.log, so `import math` would be required)
class Net(nn.Module):
    def __init__(self, s_dim, a_dim):
        super(Net, self).__init__()
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.a1 = nn.Linear(s_dim, 200)
        self.mu = nn.Linear(200, a_dim)
        self.sigma = nn.Linear(200, a_dim)
        self.c1 = nn.Linear(s_dim, 100)
        self.v = nn.Linear(100, 1)
        set_init([self.a1, self.mu, self.sigma, self.c1, self.v])
        self.distribution = torch.distributions.Normal  # Gaussian distribution over continuous actions

    def forward(self, x):
        a1 = F.relu6(self.a1(x))
        mu = 2 * F.tanh(self.mu(a1))  # mean of the action distribution, squashed to [-2, 2]
        sigma = F.softplus(self.sigma(a1)) + 0.001      # standard deviation; the small constant keeps it away from 0
        c1 = F.relu6(self.c1(x))
        values = self.v(c1)  # state value
        return mu, sigma, values

    def choose_action(self, s):
        self.training = False
        mu, sigma, _ = self.forward(s)
        m = self.distribution(mu.view(1, ).data, sigma.view(1, ).data)  # sample an action from the Gaussian
        return m.sample().numpy()

    def loss_func(self, s, a, v_t):
        self.train()
        mu, sigma, values = self.forward(s)
        td = v_t - values
        c_loss = td.pow(2)

        m = self.distribution(mu, sigma)
        log_prob = m.log_prob(a)
        entropy = 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(m.scale)  # Gaussian entropy term, encourages exploration
        exp_v = log_prob * td.detach() + 0.005 * entropy
        a_loss = -exp_v
        total_loss = (a_loss + c_loss).mean()
        return total_loss
'''


class Worker(mp.Process):
    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        super(Worker, self).__init__()
        self.name = 'w%02i' % name
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        self.lnet = Net(N_S, N_A)  # local (worker) network
        self.env = gym.make('CartPole-v0').unwrapped  # each worker gets its own environment instance

    def run(self):
        total_step = 1   # step counter
        while self.g_ep.value < MAX_EP:
            s = self.env.reset()  # initial state
            buffer_s, buffer_a, buffer_r = [], [], []  # buffers for states, actions, rewards
            ep_r = 0.  # cumulative episode reward
            while True:
                if self.name == 'w00':
                    self.env.render()  # only the first worker renders
                a = self.lnet.choose_action(v_wrap(s[None, :]))  # add a batch dimension, wrap as a tensor and sample an action
                s_, r, done, _ = self.env.step(a)  # execute action a
                '''
                s_, r, done, _ = self.env.step(a.clip(-2, 2))  # continuous-action variant: clip the action to its valid range
                '''
                if done:
                    r = -1  # penalize the terminal transition
                ep_r += r
                # store the transition (action, state, reward)
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # every UPDATE_GLOBAL_ITER steps, or at episode end, sync with the global network
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_, buffer_s, buffer_a, buffer_r, GAMMA)  # compute the loss and local gradients, push the gradients to the global network, let the shared optimizer update it, then pull the new parameters back into this worker
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear the buffers

                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                        break
                s = s_
                total_step += 1  # advance the step counter
        self.res_queue.put(None)  # tell the main process that this worker has finished


if __name__ == "__main__":
    gnet = Net(N_S, N_A)  # global network
    gnet.share_memory()  # move the network parameters into shared memory
    opt = SharedAdam(gnet.parameters(), lr=1e-4, betas=(0.92, 0.999))  # global (shared) optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()  # shared counters and a result queue

    workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())]  # one worker per CPU core
    [w.start() for w in workers]
    res = []  # record episode reward to plot
    while True:
        r = res_queue.get()  # fetch the next result from the queue (None means a worker finished)
        if r is not None:
            res.append(r)
        else:
            break
    [w.join() for w in workers]

    import matplotlib.pyplot as plt

    plt.plot(res)
    plt.ylabel('Moving average ep reward')
    plt.xlabel('Step')
    plt.show()
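
As a quick sanity check of the network shapes, here is a hypothetical snippet (not from the original post) that assumes the definitions above are in scope; for CartPole-v0, N_S == 4 and N_A == 2:

import numpy as np
net = Net(N_S, N_A)
s = np.zeros(N_S, dtype=np.float32)
logits, values = net.forward(v_wrap(s[None, :]))
print(logits.shape, values.shape)             # torch.Size([1, 2]) torch.Size([1, 1])
print(net.choose_action(v_wrap(s[None, :])))  # 0 or 1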
