#!/usr/bin/env python
# coding: utf-8
# Note: this program is for learning and reference only; do not use it for any other purpose.
# Built with the Baidu PARL framework; verified to run on Baidu AI Studio.
from parl.core.fluid import layers
import math
from gym import spaces
from gym.utils import seeding
from paddle import fluid
from parl.algorithms.fluid import PPO  # import the PPO algorithm directly from parl; no need to re-implement it
import paddle
paddle.enable_static()
import gym
import numpy as np
import parl
from parl.utils import logger
from parl.utils.rl_utils import calc_gae, calc_discount_sum_rewards
import shutil
# For the quadrotor hovering task, training does not converge unless the outputs of the
# 4 motors are unified, so CartPole is used here instead.
# The quadrotor hovering task is trained from the terminal instead.
'''
envs='Quadrotor'
task='hovering_control'
'''
gamma = 0.9                 # discount factor
lam = 0.98                  # GAE lambda
kl_targ = 0.3               # target KL divergence, used when a KL-penalty loss is selected
episodes_per_batch = 1000   # episodes collected per training batch
loss_type = 'CLIP'          # PPO surrogate loss type ('CLIP' here)
train_total_steps = 1e10    # total environment steps to train for
test_every_steps = 1e5      # evaluation interval in environment steps
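# A minimal sketch (an assumption, not part of the original script) of how gamma and lam
# are typically combined with the helpers imported from parl.utils.rl_utils to turn one
# collected trajectory into PPO training targets. The
# calc_gae(rewards, values, next_value, gamma, lam) signature is assumed here.
def _demo_advantage_targets(rewards, values, next_value=0.0):
    """Hypothetical helper: GAE advantages and discounted-return targets for one trajectory."""
    advantages = calc_gae(rewards, values, next_value, gamma, lam)
    returns = calc_discount_sum_rewards(rewards, gamma)
    return advantages, returns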
class RouletteEnv(gym.Env):
    """Lottery-style environment driven by historical draw results.

    Despite the class name, this is not the classic 37-spot roulette environment.
    Each observation is the current draw (6 red balls plus 1 blue ball) loaded from
    res.csv, and the agent's action is a guess of 7 numbers for the next draw. The
    reward is the prize tier computed by rule() scaled by 1000, with large negative
    penalties for invalid guesses (duplicate numbers, or a blue ball greater than 16).
    """
    def __init__(self):
        # print("initializing environment")
        self.n = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                  29, 30, 31, 32, 33]  # the 33 candidate red-ball numbers
        self.action_space = len(self.n)
        self.seed()
        self.num = 0
        # Each row of res.csv is expected to hold a leading field (ignored) followed by
        # the 7 drawn numbers: 6 red balls and 1 blue ball.
        with open('res.csv', 'r') as f:
            data = f.readlines()
        allres = []
        for line in data:
            fields = line.strip().split(',')
            allres.append([int(x) for x in fields[1:]])
        self.data = allres[:]
        self.nextNumber = self.data[1]  # draw that the agent tries to predict
        self.number = self.data[0]      # draw returned as the current observation
self.select = []
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
    def step(self, action):
        # The action holds 7 values: 6 red-ball guesses followed by 1 blue-ball guess.
        action = [int(a) for a in action]
        red6 = action[:6]
        blue = action[-1]
        action = sorted(red6)
        action.append(blue)
        # Penalize invalid guesses: duplicate numbers, or a blue ball above 16.
        if len(set(action)) < 7:
            return np.array(self.number), -2000, True, {}
        if action[-1] > 16:
            return np.array(self.number), -3000, True, {}
        # print(action)
        for i in range(7):
            self.select.append(int(action[i]))
        sz = self.rule(self.nextNumber, self.select)  # prize tier of the guess against the next draw
        self.select = []
        reward = sz * 1000
        isOver = True
        # if reward > 0:
        #     isOver = True
        # else:
        #     isOver = False
        return np.array(self.number), reward, isOver, {}
    def reset(self):
        # Advance to the next historical draw; restart from the beginning once the data is exhausted.
        self.num += 1
        if (self.num + 1) >= len(self.data):
            print("finished one pass through the data")
            self.__init__()
            self.num = 0
        self.number = self.data[self.num]
        self.nextNumber = self.data[self.num + 1]
        return np.array(self.number)
    def rule(self, z, select):
        """Score a 7-number guess against draw z; returns a prize-tier value, or -6 for no prize."""
        allcount = -6
        d = z
        red = select[:6]
        blue = select[-1]
        zrnum = 0  # number of matched red balls
        zblue = False
        for r in red:
            if r in d[:6]:
                zrnum += 1
        if int(blue) == int(d[-1]):
            zblue = True
        # 1st prize: 6 red + blue
        if zblue and zrnum == 6:
            allcount = 10
        # 2nd prize: 6 red
        elif zrnum == 6:
            allcount = 5
        # 3rd prize: 5 red + blue
        elif zblue and zrnum == 5:
            allcount = 4
        # 4th prize: 4 red + blue, or 5 red
        elif (zblue and zrnum == 4) or zrnum == 5:
            allcount = 3
        # 5th prize: 3 red + blue, or 4 red
        elif (zblue and zrnum == 3) or zrnum == 4:
            allcount = 2
        # 6th prize: blue matched
        elif zblue:
            allcount = 1
        return allcount
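# A quick sanity check of rule() (not part of the original script; the draw below is made up).
# rule() does not touch self, so it can be called through the class without reading res.csv.
_example_draw = [2, 9, 15, 21, 28, 33, 7]                                      # 6 red balls + 1 blue ball
assert RouletteEnv.rule(None, _example_draw, [2, 9, 15, 21, 28, 33, 7]) == 10  # 6 red + blue: 1st prize
assert RouletteEnv.rule(None, _example_draw, [2, 9, 15, 21, 28, 30, 5]) == 3   # 5 red, wrong blue: 4th prize
assert RouletteEnv.rule(None, _example_draw, [1, 4, 6, 10, 12, 14, 16]) == -6  # nothing matched: no prize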
def action_mapping(model_output_act, low_bound, high_bound):
""" mapping action space [-1, 1] of model output
to new action space [low_bound, high_bound].
Args:
model_output_act: np.array, which value is in [-1, 1]
low_bound: float, low bound of env action space
high_bound: float, high bound of env action space
Returns:
action: np.array, which value is in [low_bound, high_bound]
"""
assert high_bound > low_bound
action = low_bound + (model_output_act - (-1.0