The code in this section comes from Morvan Zhou (莫凡). Some of the code blocks in the book 《白话强化学习》 were hard for me to follow, so I turned to other teachers' code instead. Morvan's version is easy to read, though its style differs somewhat from what the book teaches, so I worked through it and added some notes for future reference.
main.py
"""
Reinforcement Learning (A3C) using PyTorch + multiprocessing.
A minimal implementation for continuous actions.
View more on my Chinese tutorial page [莫烦Python](https://morvanzhou.github.io/).
"""
import torch
import torch.nn as nn
from utils import v_wrap, set_init, push_and_pull, record
import torch.nn.functional as F
import torch.multiprocessing as mp
from shared_adam import SharedAdam
import gym
import math, os
os.environ["OMP_NUM_THREADS"] = "1"
UPDATE_GLOBAL_ITER = 5
GAMMA = 0.9
MAX_EP = 3000
MAX_EP_STEP = 200
env = gym.make('Pendulum-v0')
N_S = env.observation_space.shape[0]    # dimension of the observation space
N_A = env.action_space.shape[0]         # dimension of the action space
class Net(nn.Module):
    def __init__(self, s_dim, a_dim):
        super(Net, self).__init__()
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.a1 = nn.Linear(s_dim, 200)
        self.mu = nn.Linear(200, a_dim)
        self.sigma = nn.Linear(200, a_dim)
        self.c1 = nn.Linear(s_dim, 100)
        self.v = nn.Linear(100, 1)
        set_init([self.a1, self.mu, self.sigma, self.c1, self.v])
        # initialize the weights and biases of all layers
        self.distribution = torch.distributions.Normal

    def forward(self, x):
        a1 = F.relu6(self.a1(x))
        mu = 2 * torch.tanh(self.mu(a1))
        sigma = F.softplus(self.sigma(a1)) + 0.001      # avoid sigma = 0
        c1 = F.relu6(self.c1(x))
        values = self.v(c1)
        return mu, sigma, values

    def choose_action(self, s):
        self.training = False
        mu, sigma, _ = self.forward(s)
        m = self.distribution(mu.view(1, ).data, sigma.view(1, ).data)
        # sample an action from the Normal(mu, sigma) distribution; the
        # probability of that action can be recovered later from the same distribution
        return m.sample().numpy()

    def loss_func(self, s, a, v_t):
        self.train()
        mu, sigma, values = self.forward(s)
        td = v_t - values
        c_loss = td.pow(2)                      # critic loss: squared TD error
        m = self.distribution(mu, sigma)        # build the Normal distribution
        log_prob = m.log_prob(a)                # log-probability of the taken action
        # log_prob * td is the policy-gradient estimate of the expected action value
        entropy = 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(m.scale)
        # closed-form entropy of a Gaussian; the bonus keeps the policy stochastic and
        # encourages exploration, similar in spirit to epsilon-greedy in value-based methods
        # (a small standalone check of these distribution calls follows after this listing)
        exp_v = log_prob * td.detach() + 0.005 * entropy
        a_loss = -exp_v
        total_loss = (a_loss + c_loss).mean()
        return total_loss
class Worker(mp.Process):
    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        super(Worker, self).__init__()
        self.name = 'w%i' % name
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        self.lnet = Net(N_S, N_A)               # local network
        self.env = gym.make('Pendulum-v0').unwrapped

    def run(self):
        # run() overrides mp.Process.run(); it is executed automatically in the
        # child process once start() is called on the worker.
        # See https://blog.csdn.net/zhangxuelong461/article/details/104059149 for a clear
        # explanation of this pattern, which can be reused whenever a process needs its
        # own work loop (a minimal standalone example follows after the main.py listing).
        total_step = 1
        while self.g_ep.value < MAX_EP:
            s = self.env.reset()
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.
            for t in range(MAX_EP_STEP):
                if self.name == 'w0':
                    self.env.render()
                a = self.lnet.choose_action(v_wrap(s[None, :]))
                # s[None, :] adds a batch dimension in front of the state
                s_, r, done, _ = self.env.step(a.clip(-2, 2))
                if t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append((r + 8.1) / 8.1)        # normalize the reward
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # sync: update the global net and refresh the local net when the
                    # episode ends or every UPDATE_GLOBAL_ITER steps
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_, buffer_s, buffer_a, buffer_r, GAMMA)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    if done:                            # done: record and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                        break
                s = s_
                total_step += 1
        self.res_queue.put(None)
if __name__ == "__main__":
    gnet = Net(N_S, N_A)                    # global network
    gnet.share_memory()                     # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=1e-4, betas=(0.95, 0.999))   # global optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    # parallel training
    workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())]
    [w.start() for w in workers]
    res = []                                # record episode reward to plot
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
        else:
            break
    [w.join() for w in workers]

    import matplotlib.pyplot as plt
    plt.plot(res)
    plt.ylabel('Moving average ep reward')
    plt.xlabel('Step')
    plt.show()
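To make the Normal-distribution pieces of choose_action and loss_func concrete, here is a small self-contained check (the numbers are made up for illustration): sampling draws a stochastic action around mu with spread sigma, log_prob gives its log-probability, and the closed-form expression 0.5 + 0.5*log(2*pi) + log(sigma) used in loss_func agrees with torch's built-in Normal.entropy().
import math
import torch

mu = torch.tensor([0.5])
sigma = torch.tensor([0.7])
m = torch.distributions.Normal(mu, sigma)
a = m.sample()                      # stochastic action, as in choose_action
log_prob = m.log_prob(a)            # log pi(a|s), as in loss_func
manual = 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(m.scale)
print(a.item(), log_prob.item())
print(manual.item(), m.entropy().item())    # the two entropy values match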
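The run() override used by Worker is a general multiprocessing pattern, so here is a minimal standalone sketch of it (my own example, independent of the A3C code): subclass mp.Process, put the work loop in run(), and start() launches it in a child process while join() waits for it to finish.
import torch.multiprocessing as mp

class Hello(mp.Process):
    def __init__(self, name):
        super(Hello, self).__init__()
        self.who = name

    def run(self):
        # executed automatically in the child process after start() is called
        print('hello from', self.who)

if __name__ == '__main__':
    ps = [Hello('w%i' % i) for i in range(2)]
    [p.start() for p in ps]
    [p.join() for p in ps]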
utils.py
"""
Functions that use multiple times
"""
from torch import nn
import torch
import numpy as np
def v_wrap(np_array, dtype=np.float32):
if np_array.dtype != dtype:
np_array = np_array.astype(dtype)
return torch.from_numpy(np_array)
def set_init(layers):
    # initialize the weights and biases of every layer
    for layer in layers:
        nn.init.normal_(layer.weight, mean=0., std=0.1)
        nn.init.constant_(layer.bias, 0.)
def push_and_pull(opt, lnet, gnet, done, s_, bs, ba, br, gamma):
    if done:
        v_s_ = 0.               # terminal state has value 0
    else:
        v_s_ = lnet.forward(v_wrap(s_[None, :]))[-1].data.numpy()[0, 0]
        # bootstrap: take the critic's value estimate of the next state s'
    buffer_v_target = []
    for r in br[::-1]:          # iterate over the buffered rewards in reverse
        v_s_ = r + gamma * v_s_
        # n-step discounted return (a tiny numeric example follows after this file)
        buffer_v_target.append(v_s_)
    buffer_v_target.reverse()   # restore chronological order

    loss = lnet.loss_func(
        v_wrap(np.vstack(bs)),
        v_wrap(np.array(ba), dtype=np.int64) if ba[0].dtype == np.int64 else v_wrap(np.vstack(ba)),
        v_wrap(np.array(buffer_v_target)[:, None]))
    # v_wrap() converts the numpy buffers into torch tensors

    # calculate local gradients and push local parameters to global
    opt.zero_grad()
    loss.backward()
    for lp, gp in zip(lnet.parameters(), gnet.parameters()):
        gp._grad = lp.grad
        # hand the locally computed gradients to the global network
    opt.step()

    # pull global parameters
    lnet.load_state_dict(gnet.state_dict())
def record(global_ep, global_ep_r, ep_r, res_queue, name):
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        if global_ep_r.value == 0.:
            global_ep_r.value = ep_r
        else:
            # exponential moving average of the episode reward
            global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01
    res_queue.put(global_ep_r.value)
    print(
        name,
        "Ep:", global_ep.value,
        "| Ep_r: %.0f" % global_ep_r.value,
    )
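As a quick sanity check on the backward loop in push_and_pull, here is a tiny numeric example (the reward values are made up): with rewards [1, 2, 3], gamma = 0.9 and a bootstrap value of 0, the loop computes the discounted returns back-to-front and then reverses them into chronological order.
gamma, v_s_ = 0.9, 0.0              # discount factor and bootstrap value V(s')
br = [1.0, 2.0, 3.0]                # rewards collected in one update window
buffer_v_target = []
for r in br[::-1]:                  # same backward recursion as push_and_pull
    v_s_ = r + gamma * v_s_
    buffer_v_target.append(v_s_)
buffer_v_target.reverse()
print(buffer_v_target)              # roughly [5.23, 4.7, 3.0]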