7.7 Monte Carlo Policy Gradient (2021-10-24)

import itertools
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class Monte_Carlo_Policy_Gradient():
    def __init__(self, env, num_episodes=200, learning_rate=0.01, reward_decay=0.95):
        self.nA = env.action_space.n  # number of actions
        self.nS = env.observation_space.shape[0]  # dimension of the state space
        self.env = env  # environment
        self.num_episodes = num_episodes  # number of episodes to sample
        self.reward_decay = reward_decay  # reward discount factor
        self.learning_rate = learning_rate  # learning rate of the policy network
        self.rewards = []  # cumulative reward of every episode
        self.RENDER_REWARD_WIN = 20  # reward threshold above which rendering is switched on
        self.RENDER_ENV = False  # whether to render the environment
        self.PG = PolicyGradient(n_x=self.nS, n_y=self.nA, learning_rate=self.learning_rate, reward_decay=self.reward_decay)
        record_head = namedtuple('states', ['episode_lengths', 'episode_rewards'])
        self.record = record_head(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))

    def mcpg_learn(self):
        for i_episode in range(self.num_episodes):  # loop over episodes
            state = self.env.reset()  # reset the environment
            reward = 0  # initialize the reward to 0
            for t in itertools.count():  # step through one episode
                if self.RENDER_ENV:  # render the environment when the flag is on
                    self.env.render()
                action = self.PG.choose_action(state)  # step 1: the policy network picks an action for the current state
                next_state, reward, done, _ = self.env.step(action)  # step 2: the environment executes the action and returns feedback
                self.PG.store_transition(state, action, reward)  # step 3: record the feedback as training data for the policy network
                self.record.episode_rewards[i_episode] += reward
                self.record.episode_lengths[i_episode] = t
                if done:  # the episode has ended
                    episode_rewards_sum = sum(self.PG.episode_rewards)  # cumulative reward collected in this episode
                    self.rewards.append(episode_rewards_sum)
                    max_reward = np.amax(self.rewards)
                    self.PG.learn()  # step 4: train the policy network once the episode is over
                    if max_reward > self.RENDER_REWARD_WIN:  # once the best episode reward exceeds the threshold, start rendering so progress can be watched
                        self.RENDER_ENV = True
                    break  # leave this episode
                state = next_state  # step 5: the next state becomes the current state
        return self.record
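
Once all three classes in this section are defined, a driver along the following lines (placed at the end of the script, after the class definitions) trains the agent. This is a minimal sketch assuming the old gym API that the code above relies on (reset() returning only the state, step() returning four values); the environment name CartPole-v0, the seed, and the episode count are illustrative assumptions, not part of the original code.

import gym

if __name__ == '__main__':
    env = gym.make('CartPole-v0')  # assumed environment with a vector state and discrete actions
    env.seed(1)                    # old gym API: fix the seed so runs are repeatable
    agent = Monte_Carlo_Policy_Gradient(env, num_episodes=200)
    record = agent.mcpg_learn()    # namedtuple holding per-episode lengths and rewards
    print('mean episode reward:', np.mean(record.episode_rewards))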


class BPModel(nn.Module):
    def __init__(self, n_x, n_y):
        super(BPModel, self).__init__()
        self.layer1 = nn.Linear(n_x, 10)
        self.layer2 = nn.Linear(10, 10)
        self.layer3 = nn.Linear(10, n_y)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        output = torch.softmax(self.layer3(x), dim=-1)  # probability distribution over actions
        return output
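
As a quick check that the network really produces a probability distribution over actions, the snippet below pushes one fake state through it; the sizes (a 4-dimensional state and 2 actions, matching CartPole) are assumptions made only for this illustration. Each output row is non-negative and sums to 1 because forward() ends with a softmax.

net = BPModel(n_x=4, n_y=2)       # assumed sizes: 4-dimensional state, 2 actions
dummy_state = torch.randn(1, 4)   # one fake state with a leading batch dimension
probs = net(dummy_state)
print(probs, probs.sum().item())  # two action probabilities that sum to 1.0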


class PolicyGradient():
    def __init__(self, n_x, n_y, learning_rate=0.01, reward_decay=0.95, load_path=None, save_path=None):
        self.n_x = n_x  # input size of the policy network (state dimension)
        self.n_y = n_y  # output size of the policy network (number of actions)
        self.lr = learning_rate  # learning rate
        self.reward_decay = reward_decay  # discount factor
        self.episode_states, self.episode_actions, self.episode_rewards = [], [], []  # samples collected along one episode
        self.model = BPModel(n_x, n_y)  # build the policy network
        # the per-step loss (cross-entropy of the chosen action, weighted by the return) is computed directly in learn()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def choose_action(self, state):
        state = torch.FloatTensor(state[np.newaxis, :])  # add a batch dimension: shape (1, n_x)
        with torch.no_grad():
            prob_action = self.model(state).numpy().ravel()  # forward pass of the policy network: action probabilities
        action = np.random.choice(range(self.n_y), p=prob_action)  # sample an action according to its probability
        return action

    def store_transition(self, state, action, reward):
        self.episode_states.append(state)
        self.episode_rewards.append(reward)
        action__ = np.zeros(self.n_y)
        action__[action] = 1  # one-hot encode the chosen action
        self.episode_actions.append(action__)

    def learn(self):
        disc_ep_reward = np.zeros_like(self.episode_rewards, dtype=np.float64)  # discounted return of every step
        running_add = 0
        for t in reversed(range(0, len(self.episode_rewards))):  # work backwards to accumulate each step's discounted return
            running_add = running_add * self.reward_decay + self.episode_rewards[t]
            disc_ep_reward[t] = running_add
        disc_ep_reward -= np.mean(disc_ep_reward)  # subtract the mean
        disc_ep_reward /= np.std(disc_ep_reward)  # divide by the standard deviation
        X = torch.FloatTensor(np.vstack(self.episode_states))
        Y = torch.FloatTensor(np.vstack(self.episode_actions))
        disc_ep_reward = torch.FloatTensor(disc_ep_reward)
        outputs = self.model(X)  # action probabilities for every visited state
        # there is no supervised "error" in this algorithm; the idea is that an action that was chosen
        # should become more likely next time, so we take its negative log-probability
        neg_log_prob = -torch.sum(Y * torch.log(outputs + 1e-8), dim=1)
        # weighting by the normalized return decides whether the probability of that action should be
        # increased or decreased, i.e. the value estimate corrects the action-selection probabilities
        loss = torch.mean(neg_log_prob * disc_ep_reward)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()  # apply the gradient update
        self.episode_states, self.episode_actions, self.episode_rewards = [], [], []
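
To make the weighting in learn() concrete, the snippet below runs the same backward discounting loop on a toy reward sequence; the rewards [1.0, 1.0, 1.0] and gamma = 0.95 are made up for illustration. Each step's return is its own reward plus the discounted return of everything after it, and the normalized version of these returns is what scales the negative log-probabilities in the loss.

import numpy as np

rewards, gamma = [1.0, 1.0, 1.0], 0.95  # toy episode, values chosen only for illustration
returns = np.zeros_like(rewards)
running_add = 0.0
for t in reversed(range(len(rewards))):
    running_add = running_add * gamma + rewards[t]
    returns[t] = running_add
print(returns)       # [2.8525 1.95   1.    ]
normalized = (returns - returns.mean()) / returns.std()
print(normalized)    # early steps get a positive weight, the last step a negative one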