#!/usr/bin/env python
# coding: utf-8
# Note: this program is for learning/reference only; do not use it for any other purpose.
# Built with Baidu's PARL framework; runs successfully on Baidu AI Studio.
from parl.core.fluid import layers
import math
from gym import spaces
from gym.utils import seeding
from paddle import fluid
from parl.algorithms.fluid import PPO  # import the PPO algorithm directly from the parl library; no need to re-implement it
import paddle
paddle.enable_static()
import gym
import numpy as np
import parl
from parl.utils import logger
from parl.utils.rl_utils import calc_gae, calc_discount_sum_rewards
import shutil
# For the quadrotor hovering task, training did not converge unless the outputs of the
# 4 motors were unified, so CartPole was used instead.
# The quadrotor hovering task is trained from the terminal instead.
'''
envs='Quadrotor'
task='hovering_control'
'''
gamma = 0.9                # reward discount factor
lam = 0.98                 # GAE lambda
kl_targ = 0.3              # target KL divergence (used by the KLPEN loss)
episodes_per_batch = 1000  # episodes collected per training batch
loss_type = 'CLIP'         # 'CLIP' or 'KLPEN'
train_total_steps = 1e10   # total environment steps to train for
test_every_steps = 1e5     # evaluate (and save a checkpoint) every this many steps
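# For reference (a sketch of the standard PPO objectives, not a claim about parl's internals):
#   'CLIP'  uses the clipped surrogate loss
#       L = E[min(r_t * A_t, clip(r_t, 1 - epsilon, 1 + epsilon) * A_t)],  r_t = pi(a|s) / pi_old(a|s)
#   'KLPEN' instead penalizes KL(pi_old || pi) with a coefficient beta that is adapted
#   toward kl_targ (see PPOAgent.policy_learn below).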
class RouletteEnv(gym.Env):
"""Simple roulette environment
The roulette wheel has 37 spots. If the bet is 0 and a 0 comes up,
you win a reward of 35. If the parity of your bet matches the parity
of the spin, you win 1. Otherwise you receive a reward of -1.
The long run reward for playing 0 should be -1/37 for any state
The last action (38) stops the rollout for a return of 0 (walking away)
"""
def __init__(self):
# print("初始化环境")
self.n = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33] # 设置34种动作
self.action_space = len(self.n)
self.seed()
self.num = 0
        # Load historical draw results from res.csv; the first column of each row
        # (an id/date field) is skipped and the remaining 7 numbers are kept.
        with open('res.csv', 'r') as f:
            data = f.readlines()
        allres = []
        for line in data:
            tmpc = line.strip().split(',')
            tmp = [int(v) for v in tmpc[1:]]
            allres.append(tmp)
        self.data = allres[:]
        self.nextNumber = self.data[1]  # the next draw (the one to be "predicted")
        self.number = self.data[0]      # the current observation (the previous draw)
        self.select = []
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
    def step(self, action):
        # action is a length-7 vector: 6 red balls followed by 1 blue ball
        for i in range(7):
            action[i] = int(action[i])
        red6 = action[:6]
        blue = action[-1]
        action = sorted(red6)
        action.append(blue)
        if len(set(action)) < 7:
            # duplicate numbers in the pick: heavy penalty, end the episode
            return np.array(self.number), -2000, True, {}
        if action[-1] > 16:
            # blue ball out of range (the valid range is 1-16): heavy penalty
            return np.array(self.number), -3000, True, {}
        # print(action)
        for i in range(7):
            self.select.append(int(action[i]))
        sz = self.rule(self.nextNumber, self.select)
        self.select = []
        reward = sz * 1000
        isOver = True
        # if reward > 0:
        #     isOver = True
        # else:
        #     isOver = False
        return np.array(self.number), reward, isOver, {}
    def reset(self):
        self.num += 1
        if (self.num + 1) >= len(self.data):
            # wrap around once the historical data is exhausted
            print("finished one pass through the data")
            self.__init__()
            self.num = 0
        self.number = self.data[self.num]
        self.nextNumber = self.data[self.num + 1]
        return np.array(self.number)
    def rule(self, z, select):
        """Score the pick `select` (6 red + 1 blue) against the draw `z`.
        Returns a tier score (higher is better), or -6 when no prize tier is hit."""
        allcount = -6
        d = z
        red = select[:6]
        blue = select[-1]
        zrnum = 0  # number of matched red balls
        zblue = False
        for r in red:
            if r in d[:6]:
                zrnum += 1
        if int(blue) == int(d[-1]):
            zblue = True
        # tier 1: 6 red + blue
        if zblue and zrnum == 6:
            allcount = 10
        # tier 2: 6 red
        elif zrnum == 6:
            allcount = 5
        # tier 3: 5 red + blue
        elif zblue and zrnum == 5:
            allcount = 4
        # tier 4: 4 red + blue, or 5 red
        elif (zblue and zrnum == 4) or zrnum == 5:
            allcount = 3
        # tier 5: 3 red + blue, or 4 red
        elif (zblue and zrnum == 3) or zrnum == 4:
            allcount = 2
        # tier 6: blue matched (with fewer than 3 red)
        elif zblue:
            allcount = 1
        return allcount
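    # Quick checks of rule() against a hypothetical draw (illustration only, not part of training):
    #   rule(z=[1, 2, 3, 4, 5, 6, 7], select=[1, 2, 3, 4, 5, 6, 7])        -> 10  (6 red + blue)
    #   rule(z=[1, 2, 3, 4, 5, 6, 7], select=[10, 11, 12, 13, 14, 15, 7])  -> 1   (blue only)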
def action_mapping(model_output_act, low_bound, high_bound):
""" mapping action space [-1, 1] of model output
to new action space [low_bound, high_bound].
Args:
model_output_act: np.array, which value is in [-1, 1]
low_bound: float, low bound of env action space
high_bound: float, high bound of env action space
Returns:
action: np.array, which value is in [low_bound, high_bound]
"""
assert high_bound > low_bound
action = low_bound + (model_output_act - (-1.0)) * (
(high_bound - low_bound) / 2.0)
return action
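# Quick sanity check of the linear mapping above (illustration only):
#   action_mapping(np.array([-1.0, 0.0, 1.0]), 1, 33) -> array([ 1., 17., 33.])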
def gosmall(data, max_v, min_v):
    # Linearly rescales data from [min_v, max_v] into [0, 2] (note: not [-1, 1]);
    # only referenced in the commented-out reward scaling below.
    _range = (max_v - min_v) / 2
    res = (data - min_v) / _range
    return res
class PPOModel(parl.core.fluid.Model):
def __init__(self, obs_dim, act_dim, init_logvar=-1.0):
self.policy_model = PolicyModel(obs_dim, act_dim, init_logvar)
self.value_model = ValueModel(obs_dim, act_dim)
self.policy_lr = self.policy_model.lr
self.value_lr = self.value_model.lr
def policy(self, obs):
return self.policy_model.policy(obs)
def policy_sample(self, obs):
return self.policy_model.sample(obs)
def value(self, obs):
return self.value_model.value(obs)
class PolicyModel(parl.core.fluid.Model):
def __init__(self, obs_dim, act_dim, init_logvar):
self.obs_dim = obs_dim
self.act_dim = act_dim
hid1_size = obs_dim * 100
hid3_size = act_dim * 100
hid2_size = int(np.sqrt(hid1_size * hid3_size))
self.lr = 9e-4 / np.sqrt(hid2_size)
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=hid2_size, act='tanh')
self.fc3 = layers.fc(size=hid3_size, act='tanh')
self.fc4 = layers.fc(size=act_dim, act='tanh')
self.logvars = layers.create_parameter(
shape=[act_dim],
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(
init_logvar))
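        # logvars is a learnable, state-independent log-variance parameter (one value per
        # action dimension) that controls the exploration noise of the Gaussian policy.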
def policy(self, obs):
hid1 = self.fc1(obs)
hid2 = self.fc2(hid1)
hid3 = self.fc3(hid2)
means = self.fc4(hid3)
logvars = self.logvars()
return means, logvars
def sample(self, obs):
means, logvars = self.policy(obs)
sampled_act = means + (
layers.exp(logvars / 2.0) * # stddev
layers.gaussian_random(shape=(self.act_dim, ), dtype='float32'))
return sampled_act
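    # The sampling above draws from a diagonal Gaussian; in numpy terms it is roughly:
    #   std = np.exp(logvars / 2.0)              # logvars stores log(sigma^2)
    #   sampled_act = means + std * np.random.randn(act_dim)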
class ValueModel(parl.core.fluid.Model):
def __init__(self, obs_dim, act_dim):
super(ValueModel, self).__init__()
hid1_size = obs_dim * 100
hid3_size = 50
hid2_size = int(np.sqrt(hid1_size * hid3_size))
self.lr = 1e-2 / np.sqrt(hid2_size)
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=hid2_size, act='tanh')
self.fc3 = layers.fc(size=hid3_size, act='tanh')
self.fc4 = layers.fc(size=1)
def value(self, obs):
hid1 = self.fc1(obs)
hid2 = self.fc2(hid1)
hid3 = self.fc3(hid2)
V = self.fc4(hid3)
V = layers.squeeze(V, axes=[])
return V
class PPOAgent(parl.core.fluid.agent.Agent):
def __init__(self,
algorithm,
obs_dim,
act_dim,
kl_targ,
loss_type,
beta=1.0,
epsilon=0.2,
policy_learn_times=20,
value_learn_times=10,
value_batch_size=256):
self.alg = algorithm
self.obs_dim = obs_dim
self.act_dim = act_dim
assert loss_type == 'CLIP' or loss_type == 'KLPEN'
self.loss_type = loss_type
super(PPOAgent, self).__init__(algorithm)
self.policy_learn_times = policy_learn_times
# Adaptive kl penalty coefficient
self.beta = beta
self.kl_targ = kl_targ
self.value_learn_times = value_learn_times
self.value_batch_size = value_batch_size
self.value_learn_buffer = None
def build_program(self):
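        # Five separate fluid programs are built below: policy sampling, policy prediction,
        # policy learning, value prediction, and value learning. Each program declares its
        # own feed variables and is run independently by the executor.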
self.policy_predict_program = fluid.Program()
self.policy_sample_program = fluid.Program()
self.policy_learn_program = fluid.Program()
self.value_predict_program = fluid.Program()
self.value_learn_program = fluid.Program()
with fluid.program_guard(self.policy_sample_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
sampled_act = self.alg.sample(obs)
self.policy_sample_output = [sampled_act]
with fluid.program_guard(self.policy_predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
means = self.alg.predict(obs)
self.policy_predict_output = [means]
with fluid.program_guard(self.policy_learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
actions = layers.data(
name='actions', shape=[self.act_dim], dtype='float32')
advantages = layers.data(
name='advantages', shape=[1], dtype='float32')
if self.loss_type == 'KLPEN':
beta = layers.data(name='beta', shape=[], dtype='float32')
loss, kl = self.alg.policy_learn(obs, actions, advantages,
beta)
else:
loss, kl = self.alg.policy_learn(obs, actions, advantages)
self.policy_learn_output = [loss, kl]
with fluid.program_guard(self.value_predict_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
value = self.alg.value_predict(obs)
self.value_predict_output = [value]
with fluid.program_guard(self.value_learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
val = layers.data(name='val', shape=[], dtype='float32')
value_loss = self.alg.value_learn(obs, val)
self.value_learn_output = [value_loss]
def policy_sample(self, obs):
feed = {'obs': obs}
sampled_act = self.fluid_executor.run(
self.policy_sample_program,
feed=feed,
fetch_list=self.policy_sample_output)[0]
# print('policy_sample',sampled_act)
return sampled_act
def policy_predict(self, obs):
feed = {'obs': obs}
means = self.fluid_executor.run(
self.policy_predict_program,
feed=feed,
fetch_list=self.policy_predict_output)[0]
return means
def value_predict(self, obs):
feed = {'obs': obs}
value = self.fluid_executor.run(
self.value_predict_program,
feed=feed,
fetch_list=self.value_predict_output)
return value
    # Run one PPO policy-update step on the whole batch
def _batch_policy_learn(self, obs, actions, advantages):
if self.loss_type == 'KLPEN':
feed = {
'obs': obs,
'actions': actions,
'advantages': advantages,
'beta': self.beta
}
else:
feed = {'obs': obs, 'actions': actions, 'advantages': advantages}
[loss, kl] = self.fluid_executor.run(
self.policy_learn_program,
feed=feed,
fetch_list=self.policy_learn_output)
return loss, kl
    # Run one value-function update step on a mini-batch
def _batch_value_learn(self, obs, val):
feed = {'obs': obs, 'val': val}
value_loss = self.fluid_executor.run(
self.value_learn_program,
feed=feed,
fetch_list=self.value_learn_output)[0]
return value_loss
    # Update the policy with PPO (multiple passes over the batch)
def policy_learn(self, obs, actions, advantages):
""" Learn policy:
1. Sync parameters of policy model to old policy model
2. Fix old policy model, and learn policy model multi times
3. if use KLPEN loss, Adjust kl loss coefficient: beta
"""
self.alg.sync_old_policy()
all_loss, all_kl = [], []
for _ in range(self.policy_learn_times):
loss, kl = self._batch_policy_learn(obs, actions, advantages)
# print(loss)
all_loss.append(loss)
all_kl.append(kl)
if self.loss_type == 'KLPEN':
                # Adaptive KL penalty coefficient
if kl > self.kl_targ * 2:
self.beta = 1.5 * self.beta
elif kl < self.kl_targ / 2:
self.beta = self.beta / 1.5
return np.mean(all_loss), np.mean(all_kl)
    # Update the value function
def value_learn(self, obs, value):
""" Fit model to current data batch + previous data batch
"""
data_size = obs.shape[0]
if self.value_learn_buffer is None:
obs_train, value_train = obs, value
else:
obs_train = np.concatenate([obs, self.value_learn_buffer[0]])
value_train = np.concatenate([value, self.value_learn_buffer[1]])
self.value_learn_buffer = (obs, value)
all_loss = []
for _ in range(self.value_learn_times):
random_ids = np.arange(obs_train.shape[0])
np.random.shuffle(random_ids)
shuffle_obs_train = obs_train[random_ids]
shuffle_value_train = value_train[random_ids]
start = 0
while start < data_size:
end = start + self.value_batch_size
value_loss = self._batch_value_learn(
shuffle_obs_train[start:end, :],
shuffle_value_train[start:end])
all_loss.append(value_loss)
start += self.value_batch_size
return np.mean(all_loss)
class Scaler(object):
""" Generate scale and offset based on running mean and stddev along axis=0
offset = running mean
scale = 1 / (stddev + 0.1) / 3 (i.e. 3x stddev = +/- 1.0)
"""
def __init__(self, obs_dim):
"""
Args:
obs_dim: dimension of axis=1
"""
self.vars = np.zeros(obs_dim)
self.means = np.zeros(obs_dim)
self.cnt = 0
self.first_pass = True
def update(self, x):
""" Update running mean and variance (this is an exact method)
Args:
x: NumPy array, shape = (N, obs_dim)
        see: https://stats.stackexchange.com/questions/43159/how-to-calculate-pooled-variance-of-two-groups-given-known-group-variances-mean
"""
if self.first_pass:
self.means = np.mean(x, axis=0)
self.vars = np.var(x, axis=0)
self.cnt = x.shape[0]
self.first_pass = False
else:
n = x.shape[0]
new_data_var = np.var(x, axis=0)
new_data_mean = np.mean(x, axis=0)
new_data_mean_sq = np.square(new_data_mean)
new_means = (
(self.means * self.cnt) + (new_data_mean * n)) / (self.cnt + n)
self.vars = (((self.cnt * (self.vars + np.square(self.means))) +
(n * (new_data_var + new_data_mean_sq))) /
(self.cnt + n) - np.square(new_means))
self.vars = np.maximum(
0.0, self.vars) # occasionally goes negative, clip
self.means = new_means
self.cnt += n
def get(self):
""" returns 2-tuple: (scale, offset) """
return 1 / (np.sqrt(self.vars) + 0.1) / 3, self.means
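# Typical Scaler usage (this mirrors run_train_episode / collect_trajectories below):
#   scaler = Scaler(obs_dim)
#   scaler.update(unscaled_obs_batch)        # after each round of data collection
#   scale, offset = scaler.get()
#   normalized_obs = (obs - offset) * scale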
# Run one training episode
def run_train_episode(env, agent,scaler):
obs = env.reset()
# print("obs = ", obs)
observes, actions, rewards, unscaled_obs = [], [], [], []
step = 1.0
scale, offset = scaler.get()
scale[-1] = 1.0 # don't scale time step feature
offset[-1] = 0.0 # don't offset time step feature
while True:
obs = obs.reshape((1, -1))
obs = np.append(obs, [[step]], axis=1) # add time step feature
unscaled_obs.append(obs)
obs = (obs - offset) * scale # center and scale observations
obs = obs.astype('float32')
observes.append(obs)
reward = 0
# try:
        action0 = agent.policy_sample(obs)
        # action0 = np.argmax(action0[-1])
        action1 = np.clip(np.random.normal(action0, 1.0), -1.0, 1.0)
        # action = np.clip(action, -1.0, 1.0)
        # Errors were often raised here when training the quadrotor, but the offending
        # out-of-range values were never printed..
        # for i in range(len(action1[0])):
        #     if action1[0][i] > 1:
        #         print('greater than 1', action1[0][i])
        #     elif action1[0][i] < -1:
        #         print('less than -1', action1[0][i])
        # Even with clip this could still raise an error.
        # Note: all 7 outputs (including the blue ball) are mapped to [1, 33];
        # invalid blue-ball values are penalized inside the environment.
        action2 = action_mapping(action1, 1, 33)
        action1 = action1.reshape((1, -1)).astype('float32')
        # print("action = ", action)
        actions.append(action1)
        action = action2.reshape((1, -1)).astype('float32')
        obs, reward, done, _ = env.step(np.squeeze(action))
        # print(obs, reward, done, _)
        # reward = np.clip(reward, -1.0, 1.0)
        # reward = gosmall(reward, 17721088, -17721088)
        # except Exception as e:
        #     print(e)
        #     print("action error -- action0={}, action1={}, action2={}, action={}".format(action0, action1, action2, action))
rewards.append(reward)
step += 1e-3 # increment time step feature
if done:
break
return (np.concatenate(observes), np.concatenate(actions),
np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs))
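# run_train_episode returns, for one episode: scaled observations of shape (T, obs_dim + 1)
# (the extra column is the time-step feature), actions (T, act_dim), rewards (T,), and the
# unscaled observations that are later used to update the Scaler statistics.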
# Evaluation
def run_evaluate_episode(env, agent,scaler):
obs = env.reset()
print("验证")
rewards = []
step = 0.0
scale, offset = scaler.get()
scale[-1] = 1.0 # don't scale time step feature
offset[-1] = 0.0 # don't offset time step feature
# while True:
obs = obs.reshape((1, -1))
obs = np.append(obs, [[step]], axis=1) # add time step feature
obs = (obs - offset) * scale # center and scale observations
obs = obs.astype('float32')
    # try:
    action0 = agent.policy_sample(obs)
    # action0 = np.argmax(action0[-1])
    action1 = np.clip(action0, -1.0, 1.0)
    # action = np.clip(action, -1.0, 1.0)
    # Same caveats as in run_train_episode: the quadrotor task used to raise
    # out-of-range errors here, and clipping did not always prevent them.
    action2 = action_mapping(action1, 1, 33)
    action = action2.reshape((1, -1)).astype('float32')
    # actions.append(action)
    obs, reward, done, _ = env.step(np.squeeze(action))
    # reward = np.clip(reward, -1.0, 1.0)
    # reward = gosmall(reward, 17720188, -17720188)
    # except Exception as e:
    #     print(e)
    #     print("action error -- action0={}, action1={}, action2={}, action={}".format(action0, action1, action2, action))
    # An alternative using the deterministic agent.policy_predict() (mapped with
    # action_mapping(action, 1, 17721088)) was tried here and left commented out.
rewards.append(reward)
step += 1e-3 # increment time step feature
# if done :
# break
return np.sum(rewards)
# Collect data from several trajectories
def collect_trajectories(env, agent, scaler, episodes):
    logger.info("collecting training data, episodes={}".format(episodes))
trajectories, all_unscaled_obs = [], []
for e in range(episodes):
obs, actions, rewards, unscaled_obs = run_train_episode(
env, agent,scaler)
trajectories.append({
'obs': obs,
'actions': actions,
'rewards': rewards,
})
all_unscaled_obs.append(unscaled_obs)
# update running statistics for scaling observations
logger.info("收集训练数据结束")
scaler.update(np.concatenate(all_unscaled_obs))
return trajectories
# Build training data (advantages and discounted returns) from the collected trajectories
def build_train_data(trajectories, agent):
train_obs, train_actions, train_advantages, train_discount_sum_rewards = [], [], [], []
for trajectory in trajectories:
pred_values = agent.value_predict(trajectory['obs'])
# print(pred_values)
# scale rewards
scale_rewards = trajectory['rewards'] * (1 - gamma)
discount_sum_rewards = calc_discount_sum_rewards(
scale_rewards, gamma).astype('float32')
# print(pred_values)
advantages = calc_gae(scale_rewards, pred_values, 0, gamma,
lam)
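        # For reference: calc_gae implements generalized advantage estimation,
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),   A_t = delta_t + gamma * lam * A_{t+1},
        # and calc_discount_sum_rewards returns the discounted return sum_k gamma^k * r_{t+k}.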
# normalize advantages
advantages = (advantages - advantages.mean()) / (
advantages.std() + 1e-6)
advantages = advantages.astype('float32')
train_obs.append(trajectory['obs'])
train_actions.append(trajectory['actions'])
train_advantages.append(advantages)
train_discount_sum_rewards.append(discount_sum_rewards)
train_obs = np.concatenate(train_obs)
train_actions = np.concatenate(train_actions)
train_advantages = np.concatenate(train_advantages)
train_discount_sum_rewards = np.concatenate(train_discount_sum_rewards)
# print("train_obs={}, train_actions={}, train_advantages={}, train_discount_sum_rewards={}".format(train_obs, train_actions, train_advantages, train_discount_sum_rewards))
return train_obs, train_actions, train_advantages, train_discount_sum_rewards
def main(index_model):
# env = ContinuousCartPoleEnv()
env = RouletteEnv()
# env = make_env(envs, task=task)
    obs_dim = 7   # observation: the previous draw (6 red balls + 1 blue ball)
    act_dim = 7   # action: the pick (6 red balls + 1 blue ball)
    obs_dim += 1  # add 1 to obs_dim for the time step feature (presumably to help introduce a decay factor)
scaler = Scaler(obs_dim)
model = PPOModel(obs_dim, act_dim)
alg = PPO(
model,
act_dim=act_dim,
policy_lr=model.policy_lr,
value_lr=model.value_lr)
agent = PPOAgent(
alg, obs_dim, act_dim, kl_targ, loss_type=loss_type)
    # run a few episodes to initialize the scaler
    logger.info("pre-collecting data")
    collect_trajectories(env, agent, scaler, episodes=500)
    logger.info("finished pre-collecting data")
test_flag = 0
total_steps = 0
    # reload a previously saved model
# index_model = index_model
# agent.restore('./ormodel_dir/{}/policy_steps_{}.ckpt'.format(index_model,index_model),agent.policy_learn_program)
# agent.restore('./ormodel_dir/{}/volicy_steps_{}.ckpt'.format(index_model,index_model), agent.value_learn_program)
# print('restore ckpt success')
logger.info("train_total_steps={}".format(train_total_steps))
while total_steps < train_total_steps:
trajectories = collect_trajectories(
env, agent, scaler,episodes=episodes_per_batch)
total_steps += sum([t['obs'].shape[0] for t in trajectories])
total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])
        # build training data
        # logger.info("building training data")
train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
trajectories, agent)
        # compute policy_loss and kl
        # logger.info("start learning")
policy_loss, kl = agent.policy_learn(train_obs, train_actions,
train_advantages)
value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)
logger.info(
'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
.format(total_steps, total_train_rewards / episodes_per_batch,
policy_loss, kl, value_loss))
if total_steps // test_every_steps >= test_flag:
while total_steps // test_every_steps >= test_flag:
test_flag += 1
eval_reward = run_evaluate_episode(env, agent,scaler)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, eval_reward))
print("保存模型_",str(int(total_steps/100000)))
# 每评估一次,就保存一次模型,以训练的step数命名
pckpt = 'ormodel_dir/{}/policy_steps_{}.ckpt'.format(int(total_steps/100000),int(total_steps/100000))
agent.save(pckpt, agent.policy_learn_program)
vckpt = 'ormodel_dir/{}/volicy_steps_{}.ckpt'.format(int(total_steps/100000),int(total_steps/100000))
agent.save(vckpt, agent.value_learn_program)
tmp_m = int(total_steps/100000)
            try:
                # delete the checkpoint directory from 5 evaluations ago
                # (keep roughly the 5 most recent checkpoints)
                if tmp_m > 5:
                    path = './ormodel_dir/' + str(tmp_m - 5)
                    shutil.rmtree(path)
            except Exception:
                print("failed to delete", tmp_m)
if __name__ == "__main__":
main(12)