2021-10-25 7.18

import itertools
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class BPModel(nn.Module):
    def __init__(self, n_x, n_y):
        super(BPModel, self).__init__()
        self.layer1 = nn.Linear(n_x, 10)
        self.layer2 = nn.Linear(10, 10)
        self.layer3 = nn.Linear(10, n_y)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        output = self.layer3(x)
        return output


class PolicyGradient():
    def __init__(self, n_x, n_y, learning_rate=0.01, load_path=None, save_path=None):
        self.n_x = n_x  # policy-network input size (state dimension)
        self.n_y = n_y  # policy-network output size (number of actions)
        self.lr = learning_rate  # learning rate
        self.episode_rewards = []  # stores the TD errors collected along one episode
        self.model = BPModel(n_x, n_y)  # build the policy network
        self.loss_func = nn.MSELoss()  # mean-squared-error loss
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def store_transition(self, error):
        self.episode_rewards.append(error)  # store each TD error

    def predict(self, state):
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)  # shape (1, n_x)
        prob_weights = torch.softmax(self.model(state), dim=1)  # action probabilities
        prob_weights = prob_weights.detach().numpy().ravel()
        action = np.random.choice(range(len(prob_weights)), p=prob_weights)  # sample an action
        return action

    def learn(self, state, action, reward):
        # (Optionally, the per-step returns could be discounted and normalized here as a baseline.)
        action_onehot = np.zeros(self.n_y)
        action_onehot[action] = 1
        X = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)          # state, shape (1, n_x)
        Y = torch.as_tensor(action_onehot, dtype=torch.float32).unsqueeze(0)  # one-hot chosen action, shape (1, n_y)
        outputs = torch.softmax(self.model(X), dim=1)  # action probabilities
        neg_log_prob = self.loss_func(outputs, Y)  # there is no error signal per se; the idea is to make the chosen action more likely to be chosen again
        loss = torch.mean(neg_log_prob * float(reward))  # weight by the critic's TD error (its evaluation of the action); it decides whether this action's probability should be increased, i.e. the value signal corrects the action-selection probabilities
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.episode_rewards = []


class ValueEstimator():
    def __init__(self, n_x, n_y,
                 learning_rate=0.01, load_path=None, save_path=None):
        self.n_x = n_x
        self.n_y = n_y
        self.lr = learning_rate
        self.episode_rewards = []
        self.model = BPModel(n_x, n_y)  # build the value network
        self.loss_func = nn.MSELoss()  # mean-squared-error loss
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def store_transition(self, target):
        self.episode_rewards.append(target)

    def predict(self, state):
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)  # shape (1, n_x)
        # state-value estimate (kept non-negative via ReLU, as in the original)
        value = torch.relu(self.model(state))
        return value[0]

    def learn(self, state, target):
        X = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)  # state, shape (1, n_x)
        Y = torch.as_tensor(target, dtype=torch.float32).detach().view(1, -1)  # TD target used as the regression label
        outputs = self.model(X)  # predicted state value
        loss = self.loss_func(outputs, Y)  # regress the value estimate onto the TD target
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.episode_rewards = []


class Actor_Critic():
    def __init__(self, env, num_episodes=200, learning_rate=0.01, reward_decay=0.95):
        self.nA = env.action_space.n
        self.nS = env.observation_space.shape[0]
        self.nR = 1
        self.env = env
        self.num_episodes = num_episodes
        self.reward_decay = reward_decay
        self.learning_rate = learning_rate
        self.rewards = []
        self.RENDER_REWARD_MIN = 50  # minimum reward threshold for rendering
        self.RENDER_ENV = False
        self.actor = PolicyGradient(n_x=self.nS, n_y=self.nA,
                                    learning_rate=self.learning_rate)
        self.critic = ValueEstimator(n_x=self.nS, n_y=self.nR,
                                     learning_rate=self.learning_rate)
        record_head = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
        self.record = record_head(
            episode_lengths=np.zeros(num_episodes),
            episode_rewards=np.zeros(num_episodes))

    def mcpg_learn(self):
        for i_episode in range(self.num_episodes):  # iterate over episodes
            state = self.env.reset()  # reset the environment
            reward_ = 0  # total reward of this episode
            for t in itertools.count():  # step through the episode
                action = self.actor.predict(state)  # choose an action with the policy network (actor)
                next_state, reward, done, _ = self.env.step(action)  # execute the action
                reward_ += reward  # accumulate the total reward
                value_next = self.critic.predict(next_state)  # critic's value estimate of the next state
                td_target = reward + self.reward_decay * value_next  # TD target
                td_error = td_target - self.critic.predict(state)  # TD error
                self.critic.learn(state, td_target)  # update the value network (critic)
                self.actor.learn(state, action, td_error)  # update the policy network (actor)
                self.record.episode_rewards[i_episode] += reward
                self.record.episode_lengths[i_episode] = t
                if done:
                    self.rewards.append(reward_)
                    max_reward = np.amax(self.rewards)
                    break
                state = next_state
        return self.record
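
The listing never shows how these classes are driven, so here is a minimal usage sketch, assuming `gym` is installed, the classic `CartPole-v0` task, and the old Gym API (pre-0.26) in which `reset()` returns only the observation and `step()` returns `(obs, reward, done, info)`, matching the unpacking in `mcpg_learn`.

```python
# Minimal usage sketch (assumption: gym with the old reset()/step() API,
# matching the (next_state, reward, done, _) unpacking in mcpg_learn).
import gym
import numpy as np

if __name__ == "__main__":
    env = gym.make("CartPole-v0")  # 4-dimensional state, 2 discrete actions
    agent = Actor_Critic(env, num_episodes=200,
                         learning_rate=0.01, reward_decay=0.95)
    record = agent.mcpg_learn()

    # Rough learning curve: mean episode reward over blocks of 20 episodes.
    for start in range(0, len(record.episode_rewards), 20):
        block = record.episode_rewards[start:start + 20]
        print(f"episodes {start:3d}-{start + len(block) - 1:3d}: "
              f"mean reward {np.mean(block):6.1f}")
```

Note that the actor's loss here is a mean-squared error between the softmaxed outputs and the one-hot chosen action, scaled by the critic's TD error; a more common formulation multiplies the TD error by the negative log-probability of the chosen action (e.g. via a cross-entropy loss), but the surrogate above keeps the structure of the original post.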
