#!/usr/bin/env python
# coding: utf-8
# Note: this program is for learning/reference only; do not use it for any other purpose.
# Built with Baidu's PARL framework; runs successfully on Baidu AI Studio.
from parl.core.fluid import layers
import math
from gym import spaces
from gym.utils import seeding
from paddle import fluid
from parl.algorithms.fluid import PPO  # import the PPO algorithm directly from the parl library; no need to re-implement it
import paddle
paddle.enable_static()
import gym
import numpy as np
import parl
from parl.utils import logger
from parl.utils.rl_utils import calc_gae, calc_discount_sum_rewards
import shutil
# For the quadrotor hovering task, training did not converge unless the outputs of the
# 4 motors were unified, so CartPole was used instead.
# The quadrotor hovering task is trained from the terminal instead.
'''
envs='Quadrotor'
task='hovering_control'
'''
gamma = 0.9                # reward discount factor
lam = 0.98                 # GAE lambda
kl_targ = 0.3              # target KL divergence (used by the KLPEN loss)
episodes_per_batch = 1000  # episodes collected per training batch
loss_type = 'CLIP'         # 'CLIP' or 'KLPEN'
train_total_steps = 1e10   # total environment steps to train for
test_every_steps = 1e5     # evaluate (and save a checkpoint) every this many steps
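# For reference (a sketch of the standard PPO objectives, not a claim about parl's internals):
#   'CLIP'  uses the clipped surrogate loss
#       L = E[min(r_t * A_t, clip(r_t, 1 - epsilon, 1 + epsilon) * A_t)],  r_t = pi(a|s) / pi_old(a|s)
#   'KLPEN' instead penalizes KL(pi_old || pi) with a coefficient beta that is adapted
#   toward kl_targ (see PPOAgent.policy_learn below).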
class RouletteEnv(gym.Env):
"""Simple roulette environment
The roulette wheel has 37 spots. If the bet is 0 and a 0 comes up,
you win a reward of 35. If the parity of your bet matches the parity
of the spin, you win 1. Otherwise you receive a reward of -1.
The long run reward for playing 0 should be -1/37 for any state
The last action (38) stops the rollout for a return of 0 (walking away)
"""
def __init__(self):
# print("初始化环境")
self.n = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33] # 设置34种动作
self.action_space = len(self.n)
self.seed()
self.num = 0
        # Load historical draw results from res.csv; the first column of each row
        # (an id/date field) is skipped and the remaining 7 numbers are kept.
        with open('res.csv', 'r') as f:
            data = f.readlines()
        allres = []
        for line in data:
            tmpc = line.strip().split(',')
            tmp = [int(v) for v in tmpc[1:]]
            allres.append(tmp)
        self.data = allres[:]
        self.nextNumber = self.data[1]  # the next draw (the one to be "predicted")
        self.number = self.data[0]      # the current observation (the previous draw)
        self.select = []
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
    def step(self, action):
        # action is a length-7 vector: 6 red balls followed by 1 blue ball
        for i in range(7):
            action[i] = int(action[i])
        red6 = action[:6]
        blue = action[-1]
        action = sorted(red6)
        action.append(blue)
        if len(set(action)) < 7:
            # duplicate numbers in the pick: heavy penalty, end the episode
            return np.array(self.number), -2000, True, {}
        if action[-1] > 16:
            # blue ball out of range (the valid range is 1-16): heavy penalty
            return np.array(self.number), -3000, True, {}
        # print(action)
        for i in range(7):
            self.select.append(int(action[i]))
        sz = self.rule(self.nextNumber, self.select)
        self.select = []
        reward = sz * 1000
        isOver = True
        # if reward > 0:
        #     isOver = True
        # else:
        #     isOver = False
        return np.array(self.number), reward, isOver, {}
    def reset(self):
        self.num += 1
        if (self.num + 1) >= len(self.data):
            # wrap around once the historical data is exhausted
            print("finished one pass through the data")
            self.__init__()
            self.num = 0
        self.number = self.data[self.num]
        self.nextNumber = self.data[self.num + 1]
        return np.array(self.number)
    def rule(self, z, select):
        """Score the pick `select` (6 red + 1 blue) against the draw `z`.
        Returns a tier score (higher is better), or -6 when no prize tier is hit."""
        allcount = -6
        d = z
        red = select[:6]
        blue = select[-1]
        zrnum = 0  # number of matched red balls
        zblue = False
        for r in red:
            if r in d[:6]:
                zrnum += 1
        if int(blue) == int(d[-1]):
            zblue = True
        # tier 1: 6 red + blue
        if zblue and zrnum == 6:
            allcount = 10
        # tier 2: 6 red
        elif zrnum == 6:
            allcount = 5
        # tier 3: 5 red + blue
        elif zblue and zrnum == 5:
            allcount = 4
        # tier 4: 4 red + blue, or 5 red
        elif (zblue and zrnum == 4) or zrnum == 5:
            allcount = 3
        # tier 5: 3 red + blue, or 4 red
        elif (zblue and zrnum == 3) or zrnum == 4:
            allcount = 2
        # tier 6: blue matched (with fewer than 3 red)
        elif zblue:
            allcount = 1
        return allcount
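    # Quick checks of rule() against a hypothetical draw (illustration only, not part of training):
    #   rule(z=[1, 2, 3, 4, 5, 6, 7], select=[1, 2, 3, 4, 5, 6, 7])        -> 10  (6 red + blue)
    #   rule(z=[1, 2, 3, 4, 5, 6, 7], select=[10, 11, 12, 13, 14, 15, 7])  -> 1   (blue only)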
def action_mapping(model_output_act, low_bound, high_bound):
""" mapping action space [-1, 1] of model output
to new action space [low_bound, high_bound].
Args:
model_output_act: np.array, which value is in [-1, 1]
low_bound: float, low bound of env action space
high_bound: float, high bound of env action space
Returns:
action: np.array, which value is in [low_bound, high_bound]
"""
assert high_bound > low_bound
action = low_bound + (model_output_act - (-1.0)) * (
(high_bound - low_bound) / 2.0)
return action
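# Quick sanity check of the linear mapping above (illustration only):
#   action_mapping(np.array([-1.0, 0.0, 1.0]), 1, 33) -> array([ 1., 17., 33.])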
def gosmall(data, max_v, min_v):
    # Linearly rescales data from [min_v, max_v] into [0, 2] (note: not [-1, 1]);
    # only referenced in the commented-out reward scaling below.
    _range = (max_v - min_v) / 2
    res = (data - min_v) / _range
    return res
class PPOModel(parl.core.fluid.Model):
def __init__(self, obs_dim, act_dim, init_logvar=-1.0):
self.policy_model = PolicyModel(obs_dim, act_dim, init_logvar)
self.value_model = ValueModel(obs_dim, act_dim)
self.policy_lr = self.policy_model.lr
self.value_lr = self.value_model.lr
def policy(self, obs):
return self.policy_model.policy(obs)
def policy_sample(self, obs):
return self.policy_model.sample(obs)
def value(self, obs):
return self.value_model.value(obs)
class PolicyModel(parl.core.fluid.Model):
def __init__(self, obs_dim, act_dim, init_logvar):
self.obs_dim = obs_dim
self.act_dim = act_dim
hid1_size = obs_dim * 100
hid3_size = act_dim * 100
hid2_size = int(np.sqrt(hid1_size * hid3_size))
self.lr = 9e-4 / np.sqrt(hid2_size)
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=hid2_size, act='tanh')
self.fc3 = layers.fc(size=hid3_size, act='tanh')
self.fc4 = layers.fc(size=act_dim, act='tanh')
self.logvars = layers.create_parameter(
shape=[act_dim],
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(
init_logvar))
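        # logvars is a learnable, state-independent log-variance parameter (one value per
        # action dimension) that controls the exploration noise of the Gaussian policy.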
def policy(self, obs):
hid1 = self.fc1(obs)
hid2 = self.fc2(hid1)
hid3 = self.fc3(hid2)
means = self.fc4(hid3)
logvars = self.logvars()
return means, logvars
def sample(self, obs):
means, logvars = self.policy(obs)
sampled_act = means + (
layers.exp(logvars / 2.0) * # stddev
layers.gaussian_random(shape=(self.act_dim, ), dtype='float32'))
return sampled_act
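    # The sampling above draws from a diagonal Gaussian; in numpy terms it is roughly:
    #   std = np.exp(logvars / 2.0)              # logvars stores log(sigma^2)
    #   sampled_act = means + std * np.random.randn(act_dim)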
class ValueModel(parl.core.fluid.Model):
def __init__(self, obs_dim, act_dim):
super(ValueModel, self).__init__()
hid1_size = obs_dim * 100
hid3_size = 50
hid2_size = int(np.sqrt(hid1_size * hid3_size))
self.lr = 1e-2 / np.sqrt(hid2_size)
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=hid2_size, act='tanh')
self.fc3 = layers.fc(size=hid3_size, act='tanh')
self.fc4 = layers.fc(size=1)
def value(self, obs):
hid1 = self.fc1(obs)
hid2 = self.fc2(hid1)
hid3 = self.fc3(hid2)
V = self.fc4(hid3)
V = layers.squeeze(V, axes=[])
return V
class PPOAgent(parl.core.fluid.agent.Agent):
def __init__(self,
algorithm,
obs_dim,
act_dim,
kl_targ,
loss_type,
beta=1.0,
epsilon=0.2,
policy_learn_times=20,
value_learn_times=10,
value_batch_size=256):
self.alg = algorithm
self.obs_dim = obs_dim
self.act_dim = act_dim
assert loss_type == 'CLIP' or loss_type == 'KLPEN'
self.loss_type = loss_type
super(PPOAgent, self).__init__(algorithm)
self.policy_learn_times = policy_learn_times
# Adaptive kl penalty coefficient
self.beta = beta
self.kl_targ = kl_targ
self.value_learn_times = value_learn_times
self.value_batch_size = value_batch_size
self.value_learn_buffer = None
def build_program(self):
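        # Five separate fluid programs are built below: policy sampling, policy prediction,
        # policy learning, value prediction, and value learning. Each program declares its
        # own feed variables and is run independently by the executor.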
self.policy_predict_program = fluid.Program()
self.policy_sample_program = fluid.Program()
self.policy_learn_program = fluid.Program()
self.value_predict_program = fluid.Program()
self.value_learn_program = fluid.Program()
with fluid.program_guard(self.policy_sample_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
sampled_act = self.alg.sample(obs)
self.policy_sample_output = [sampled_act]
with fluid.program_guard(self.policy_predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
means = self.alg.predict(obs)
self.policy_predict_output = [means]
with fluid.program_guard(self.policy_learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
actions = layers.data(
name='actions', shape=[self.act_dim], dtype='float32')
advantages = layers.data(
name='advantages', shape=[1], dtype='float32')
if self.loss_type == 'KLPEN':
beta = layers.data(name='beta', shape=[], dtype='float32')
loss, kl = self.alg.policy_learn(obs, actions, advantages,
beta)
else:
loss, kl = self.alg.policy_learn(obs, actions, advantages)
self.policy_learn_output = [loss, kl]
with fluid.program_guard(self.value_predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
value = self.alg.value_predict(obs)
self.value_predict_output = [value]
with fluid.program_guard(self.value_learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
val = layers.data(name='val', shape=[], dtype='float32')
value_loss = self.alg.value_learn(obs, val)
self.value_learn_output = [value_loss]
def policy_sample(self, obs):
feed = {'obs': obs}
sampled_act = self.fluid_executor.run(
self.policy_sample_program,
feed=feed,
fetch_list=self.policy_sample_output)[0]
# print('policy_sample',sampled_act)
return sampled_act
def policy_predict(self, obs):
feed = {'obs': obs}
means = self.fluid_executor.run(
self.policy_predict_program,
feed=feed,
fetch_list=self.policy_predict_output)[0]
return means
def value_predict(self, obs):
feed = {'obs': obs}
value = self.fluid_executor.run(
self.value_predict_program,
feed=feed,
fetch_list=self.value_predict_output)
return value
    # Run one PPO policy-update step on the whole batch
def _batch_policy_learn(self, obs, actions, advantages):
if self.loss_type == 'KLPEN':
feed = {
'obs': obs,
'actions': actions,
'advantages': advantages,
'beta': self.beta
}
else:
feed = {'obs': obs, 'actions': actions, 'advantages': advantages}
[loss, kl] = self.fluid_executor.run(
self.policy_learn_program,
feed=feed,
fetch_list=self.policy_learn_output)
return loss, kl
    # Run one value-function update step on a mini-batch
def _batch_value_learn(self, obs, val):
feed = {'obs': obs, 'val': val}
value_loss = self.fluid_executor.run(
self.value_learn_program,
feed=feed,
fetch_list=self.value_learn_output)[0]
return value_loss
    # Update the policy with PPO (multiple passes over the batch)
def policy_learn(self, obs, actions, advantages):
""" Learn policy:
1. Sync parameters of policy model to old policy model
2. Fix old policy model, and learn policy model multi times
3. if use KLPEN loss, Adjust kl loss coefficient: beta
"""
self.alg.sync_old_policy()
all_loss, all_kl = [], []
for _ in range(self.policy_learn_times):
loss, kl = self._batch_policy_learn(obs, actions, advantages)
# print(loss)
all_loss.append(loss)
all_kl.append(kl)
if self.loss_type == 'KLPEN':
                # Adaptive KL penalty coefficient
if kl > self.kl_targ * 2:
self.beta = 1.5 * self.beta
elif kl < self.kl_targ / 2:
self.beta = self.beta / 1.5
return np.mean(all_loss), np.mean(all_kl)
    # Update the value function
def value_learn(self, obs, value):
""" Fit model to current data batch + previous data batch
"""
data_size = obs.shape[0]
if self.value_learn_buffer is None:
obs_train, value_train = obs, value
else:
obs_train = np.concatenate([obs, self.value_learn_buffer[0]])
value_train = np.concatenate([value, self.value_learn_buffer[1]])
self.value_learn_buffer = (obs, value)
all_loss = []
for _ in range(self.value_learn_times):
random_ids = np.arange(obs_train.shape[0])
np.random.shuffle(random_ids)
shuffle_obs_train = obs_train[random_ids]
shuffle_value_train = value_train[random_ids]
start = 0
while start < data_size:
end = start + self.value_batch_size
value_loss = self._batch_value_learn(
shuffle_obs_train[start:end, :],
shuffle_value_train[start:end])
all_loss.append(value_loss)
start += self.value_batch_size
return np.mean(all_loss)
class Scaler(object):
""" Generate scale and offset based on running mean and stddev along axis=0
offset = running mean
scale = 1 / (stddev + 0.1) / 3 (i.e. 3x stddev = +/- 1.0)
"""
def __init__(self, obs_dim):
"""
Args:
obs_dim: dimension of axis=1
"""
self.vars = np.zeros(obs_dim)
self.means = np.zeros(obs_dim)
self.cnt = 0
self.first_pass = True
def update(self, x):
""" Update running mean and variance (this is an exact method)
Args:
x: NumPy array, shape = (N, obs_dim)
        see: https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled-variance-of-two-groups-given-known-group-variances-mean
"""
if self.first_pass:
self.means = np.mean(x, axis=0)
self.vars = np.var(x, axis=0)
self.cnt = x.shape[0]
self.first_pass = False
else:
n = x.shape[0]
new_data_var = np.var(x, axis=0)
new_data_mean = np.mean(x, axis=0)
new_data_mean_sq = np.square(new_data_mean)
new_means = (
(self.means * self.cnt) + (new_data_mean * n)) / (self.cnt + n)
self.vars = (((self.cnt * (self.vars + np.square(self.means))) +
(n * (new_data_var + new_data_mean_sq))) /
(self.cnt + n) - np.square(new_means))
self.vars = np.maximum(
0.0, self.vars) # occasionally goes negative, clip
self.means = new_means
self.cnt += n
def get(self):
""" returns 2-tuple: (scale, offset) """
return 1 / (np.sqrt(self.vars) + 0.1) / 3, self.means
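# Typical Scaler usage (this mirrors run_train_episode / collect_trajectories below):
#   scaler = Scaler(obs_dim)
#   scaler.update(unscaled_obs_batch)        # after each round of data collection
#   scale, offset = scaler.get()
#   normalized_obs = (obs - offset) * scale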
# Run one training episode
def run_train_episode(env, agent,scaler):
obs = env.reset()
# print("obs = ", obs)
observes, actions, rewards, unscaled_obs = [], [], [], []
step = 1.0
scale, offset = scaler.get()
scale[-1] = 1.0 # don't scale time step feature
offset[-1] = 0.0 # don't offset time step feature
while True:
obs = obs.reshape((1, -1))
obs = np.append(obs, [[step]], axis=1) # add time step feature
unscaled_obs.append(obs)
obs = (obs - offset) * scale # center and scale observations
obs = obs.astype('float32')
observes.append(obs)
reward = 0
# try:
        action0 = agent.policy_sample(obs)
        # action0 = np.argmax(action0[-1])
        action1 = np.clip(np.random.normal(action0, 1.0), -1.0, 1.0)
        # action = np.clip(action, -1.0, 1.0)
        # Errors were often raised here when training the quadrotor, but the offending
        # out-of-range values were never printed..
        # for i in range(len(action1[0])):
        #     if action1[0][i] > 1:
        #         print('greater than 1', action1[0][i])
        #     elif action1[0][i] < -1:
        #         print('less than -1', action1[0][i])
        # Even with clip this could still raise an error.
        # Note: all 7 outputs (including the blue ball) are mapped to [1, 33];
        # invalid blue-ball values are penalized inside the environment.
        action2 = action_mapping(action1, 1, 33)
        action1 = action1.reshape((1, -1)).astype('float32')
        # print("action = ", action)
        actions.append(action1)
        action = action2.reshape((1, -1)).astype('float32')
        obs, reward, done, _ = env.step(np.squeeze(action))
        # print(obs, reward, done, _)
        # reward = np.clip(reward, -1.0, 1.0)
        # reward = gosmall(reward, 17721088, -17721088)
        # except Exception as e:
        #     print(e)
        #     print("action error -- action0={}, action1={}, action2={}, action={}".format(action0, action1, action2, action))
rewards.append(reward)
step += 1e-3 # increment time step feature
if done:
break
return (np.concatenate(observes), np.concatenate(actions),
np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs))
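# run_train_episode returns, for one episode: scaled observations of shape (T, obs_dim + 1)
# (the extra column is the time-step feature), actions (T, act_dim), rewards (T,), and the
# unscaled observations that are later used to update the Scaler statistics.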
# Evaluation
def run_evaluate_episode(env, agent,scaler):
obs = env.reset()
print("验证")
rewards = []
step = 0.0
scale, offset = scaler.get()
scale[-1] = 1.0 # don't scale time step feature
offset[-1] = 0.0 # don't offset time step feature
# while True:
obs = obs.reshape((1, -1))
obs = np.append(obs, [[step]], axis=1) # add time step feature
obs = (obs - offset) * scale # center and scale observations
obs = obs.astype('float32')
    # try:
    action0 = agent.policy_sample(obs)
    # action0 = np.argmax(action0[-1])
    action1 = np.clip(action0, -1.0, 1.0)
    # action = np.clip(action, -1.0, 1.0)
    # Same caveats as in run_train_episode: the quadrotor task used to raise
    # out-of-range errors here, and clipping did not always prevent them.
    action2 = action_mapping(action1, 1, 33)
    action = action2.reshape((1, -1)).astype('float32')
    # actions.append(action)
    obs, reward, done, _ = env.step(np.squeeze(action))
    # reward = np.clip(reward, -1.0, 1.0)
    # reward = gosmall(reward, 17720188, -17720188)
    # except Exception as e:
    #     print(e)
    #     print("action error -- action0={}, action1={}, action2={}, action={}".format(action0, action1, action2, action))
    # An alternative using the deterministic agent.policy_predict() (mapped with
    # action_mapping(action, 1, 17721088)) was tried here and left commented out.
rewards.append(reward)
step += 1e-3 # increment time step feature
# if done :
# break
return np.sum(rewards)
# Collect data from several trajectories
def collect_trajectories(env, agent, scaler, episodes):
    logger.info("collecting training data, episodes={}".format(episodes))
trajectories, all_unscaled_obs = [], []
for e in range(episodes):
obs, actions, rewards, unscaled_obs = run_train_episode(
env, agent,scaler)
trajectories.append({
'obs': obs,
'actions': actions,
'rewards': rewards,
})
all_unscaled_obs.append(unscaled_obs)
# update running statistics for scaling observations
logger.info("收集训练数据结束")
scaler.update(np.concatenate(all_unscaled_obs))
return trajectories
# Build training data (advantages and discounted returns) from the collected trajectories
def build_train_data(trajectories, agent):
train_obs, train_actions, train_advantages, train_discount_sum_rewards = [], [], [], []
for trajectory in trajectories:
pred_values = agent.value_predict(trajectory['obs'])
# print(pred_values)
# scale rewards
scale_rewards = trajectory['rewards'] * (1 - gamma)
discount_sum_rewards = calc_discount_sum_rewards(
scale_rewards, gamma).astype('float32')
# print(pred_values)
advantages = calc_gae(scale_rewards, pred_values, 0, gamma,
lam)
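        # For reference: calc_gae implements generalized advantage estimation,
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),   A_t = delta_t + gamma * lam * A_{t+1},
        # and calc_discount_sum_rewards returns the discounted return sum_k gamma^k * r_{t+k}.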
# normalize advantages
advantages = (advantages - advantages.mean()) / (
advantages.std() + 1e-6)
advantages = advantages.astype('float32')
train_obs.append(trajectory['obs'])
train_actions.append(trajectory['actions'])
train_advantages.append(advantages)
train_discount_sum_rewards.append(discount_sum_rewards)
train_obs = np.concatenate(train_obs)
train_actions = np.concatenate(train_actions)
train_advantages = np.concatenate(train_advantages)
train_discount_sum_rewards = np.concatenate(train_discount_sum_rewards)
# print("train_obs={}, train_actions={}, train_advantages={}, train_discount_sum_rewards={}".format(train_obs, train_actions, train_advantages, train_discount_sum_rewards))
return train_obs, train_actions, train_advantages, train_discount_sum_rewards
def main(index_model):
# env = ContinuousCartPoleEnv()
env = RouletteEnv()
# env = make_env(envs, task=task)
    obs_dim = 7   # observation: the previous draw (6 red balls + 1 blue ball)
    act_dim = 7   # action: the pick (6 red balls + 1 blue ball)
    obs_dim += 1  # add 1 to obs_dim for the time step feature (presumably to help introduce a decay factor)
scaler = Scaler(obs_dim)
model = PPOModel(obs_dim, act_dim)
alg = PPO(
model,
act_dim=act_dim,
policy_lr=model.policy_lr,
value_lr=model.value_lr)
agent = PPOAgent(
alg, obs_dim, act_dim, kl_targ, loss_type=loss_type)
    # run a few episodes to initialize the scaler
    logger.info("pre-collecting data")
    collect_trajectories(env, agent, scaler, episodes=500)
    logger.info("finished pre-collecting data")
test_flag = 0
total_steps = 0
    # reload a previously saved model
# index_model = index_model
# agent.restore('./ormodel_dir/{}/policy_steps_{}.ckpt'.format(index_model,index_model),agent.policy_learn_program)
# agent.restore('./ormodel_dir/{}/volicy_steps_{}.ckpt'.format(index_model,index_model), agent.value_learn_program)
# print('restore ckpt success')
logger.info("train_total_steps={}".format(train_total_steps))
while total_steps < train_total_steps:
trajectories = collect_trajectories(
env, agent, scaler,episodes=episodes_per_batch)
total_steps += sum([t['obs'].shape[0] for t in trajectories])
total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])
        # build training data
        # logger.info("building training data")
train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
trajectories, agent)
        # compute policy_loss and kl
        # logger.info("start learning")
policy_loss, kl = agent.policy_learn(train_obs, train_actions,
train_advantages)
value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)
logger.info(
'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
.format(total_steps, total_train_rewards / episodes_per_batch,
policy_loss, kl, value_loss))
if total_steps // test_every_steps >= test_flag:
while total_steps // test_every_steps >= test_flag:
test_flag += 1
eval_reward = run_evaluate_episode(env, agent,scaler)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, eval_reward))
print("保存模型_",str(int(total_steps/100000)))
# 每评估一次,就保存一次模型,以训练的step数命名
pckpt = 'ormodel_dir/{}/policy_steps_{}.ckpt'.format(int(total_steps/100000),int(total_steps/100000))
agent.save(pckpt, agent.policy_learn_program)
vckpt = 'ormodel_dir/{}/volicy_steps_{}.ckpt'.format(int(total_steps/100000),int(total_steps/100000))
agent.save(vckpt, agent.value_learn_program)
tmp_m = int(total_steps/100000)
            try:
                # delete the checkpoint directory from 5 evaluations ago
                # (keep roughly the 5 most recent checkpoints)
                if tmp_m > 5:
                    path = './ormodel_dir/' + str(tmp_m - 5)
                    shutil.rmtree(path)
            except Exception:
                print("failed to delete", tmp_m)
if __name__ == "__main__":
main(12)