PyTorch: Understanding the Reinforcement Learning Policy Gradient in Detail Through Code

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import collections
import random
from torch import optim

class PolicyGradient(nn.Module):
    def __init__(self, n_actions, n_inputs, lr=0.01):
        super(PolicyGradient, self).__init__()
        self.fc1 = nn.Linear(n_inputs, 64)  # input dimension is 4 for CartPole
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, n_actions)  # output dimension is 2 (one entry per action)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.device = 'cpu'
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = F.softmax(self.fc3(x), dim=1)     # the PG loss needs action probabilities, so apply softmax along the action (row) dimension
        return actions
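
# Quick sanity check of the forward pass (an illustrative addition, not in the original post):
# each row of the output should sum to 1 because of the softmax over dim=1, e.g.
#   net = PolicyGradient(n_actions=2, n_inputs=4)
#   probs = net(torch.zeros(1, 4))   # shape (1, 2)
#   probs.sum(dim=1)                 # tensor([1.]) up to float32 rounding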


class Agent:
    def __init__(self, env, n_actions, state_n_dim, gamma=0.99):
        self.env = env
        self.n_actions = n_actions
        self.state_n_dim = state_n_dim
        self.gamma = gamma  # discount factor for future rewards
        self.eps_min = 0.05  # unused here (leftover from DQN): epsilon would decay but never below eps_min
        self.eps_dec = 5e-4  # unused here (leftover from DQN): decay of 0.0005 per step
        self.policy_net = PolicyGradient(self.n_actions, self.state_n_dim)    # the network to be trained
        self.scores = []    # record the score of each episode
        self.loss = 0
        self.state_list, self.action_list, self.G_reward_list = [], [], []

    '''Choose an action. Unlike DQN, the action is not picked from Q values but sampled
    from the softmax probabilities, so no epsilon-greedy is needed: the sampling itself
    already provides exploration. An alternative using torch.distributions is sketched
    right after this method.'''
    def choose_action(self, state):
        with torch.no_grad():
            q = self.policy_net(torch.tensor([state], dtype=torch.float32))
            '''When running this, np.random.choice() complained that the probabilities in q
            do not sum to exactly 1 (float32 rounding, apparently numpy-version dependent),
            so q is converted to numpy and renormalised below.'''
            # print('network output q before processing:', q)
            q = q.squeeze(0).numpy()
            q /= q.sum()
            # print('network output q after processing:', q)
        action = np.random.choice(self.n_actions, p=q)  # returns the index of the sampled action
        # print('action sampled according to the network output probabilities:', action)
        return action
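
    '''An alternative sampling sketch (an addition for illustration, not in the original post):
    torch.distributions.Categorical samples directly from the network output and avoids the
    manual renormalisation above, because it normalises the probabilities internally.'''
    def choose_action_categorical(self, state):
        with torch.no_grad():
            probs = self.policy_net(torch.tensor([state], dtype=torch.float32))  # shape (1, n_actions)
        dist = torch.distributions.Categorical(probs=probs)
        return dist.sample().item()  # index of the sampled action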

    def calc_reward_to_go(self, reward_list, gamma=0.6):  # turn each r_t into G_t, the discounted return from step t onwards (note: this local default is what is actually used, not self.gamma)
        for i in range(len(reward_list) - 2, -1, -1):
            # G_t = r_t + gamma * G_{t+1}
            reward_list[i] += gamma * reward_list[i + 1]  # G_t
        return reward_list
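
    # Worked example of the reward-to-go computation (illustrative numbers, not from the post):
    #   calc_reward_to_go([1, 1, 1], gamma=0.5)
    #   G_2 = 1,  G_1 = 1 + 0.5*1 = 1.5,  G_0 = 1 + 0.5*1.5 = 1.75
    #   -> returns [1.75, 1.5, 1]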

    def learn(self):  # run one full episode, then update the policy
        done = False
        reward_list = []    # record the reward of every step
        state = self.env.reset()  # e.g. [-0.00797301  0.02779841 -0.04731911  0.03995738]
        # print('game starts, initial state:', state)
        score = 0  # total score of this episode
        while not done:
            action = self.choose_action(state)  # sample an action index, e.g. 0
            next_state, reward, done, _ = self.env.step(action)
            # print('environment step gives next_state:', next_state, 'reward:', reward, 'done:', done)
            score += reward
            self.state_list.append(state)
            self.action_list.append(action)
            reward_list.append(reward)
            state = next_state

        self.scores.append(score)
        self.G_reward_list = self.calc_reward_to_go(reward_list)
        # print('G_reward_list:', self.G_reward_list)

        self.policy_net.optimizer.zero_grad()

        state_batch = torch.tensor(np.array(self.state_list), dtype=torch.float32)  # stack first to avoid the slow list-of-arrays conversion
        action_batch = torch.tensor(self.action_list)  # e.g. tensor([0, 1, 1, 1, 0, ...])
        reward_batch = torch.tensor(self.G_reward_list)

        pred_list = self.policy_net(state_batch)  # action probabilities, shape (T, n_actions)
        # print('pred_list: ', pred_list)
        '''F.cross_entropy(pred, target) expects raw logits (no softmax). pred_list has already
        been through softmax, so nll_loss() is applied to its log instead; nll_loss() picks the
        entry of the chosen action and negates it automatically.'''
        loss = F.nll_loss(pred_list.log(), action_batch, reduction='none')  # per-step -log pi(a_t|s_t); reduction='none' keeps one value per step
        # print('loss: ', loss)
        loss = torch.mean(loss * reward_batch)
        # print('loss weighted by G_t and averaged: ', loss)
        self.loss = loss.item()
        loss.backward()  # backpropagate to compute gradients
        self.policy_net.optimizer.step()  # apply the gradient step to update the parameters
        self.state_list, self.action_list, self.G_reward_list = [], [], []  # clear the episode buffers
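
    '''Equivalent loss sketch (an addition for illustration, not the author's code): the nll_loss
    trick in learn() computes the REINFORCE loss  mean_t[ -log pi(a_t|s_t) * G_t ].  The same
    value can be written explicitly with gather():'''
    def reinforce_loss(self, pred_list, action_batch, reward_batch):
        log_probs = torch.log(pred_list.gather(1, action_batch.unsqueeze(1)).squeeze(1))  # log pi(a_t|s_t)
        return -(log_probs * reward_batch).mean()  # weight by G_t, negate, average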

    def save_model(self):
        torch.save(self.policy_net.state_dict(), 'pg_net.pth')  # save the model parameters

if __name__ == '__main__':
    round_count = 5  # run 5 rounds and average the results
    round_all_score = 0
    env = gym.make('CartPole-v0')
    n_actions = env.action_space.n  # 2 possible actions
    state_n_dims = env.observation_space.shape[0]  # state dimension is 4
    for i in range(round_count):
        agent = Agent(env, n_actions, state_n_dims)
        episodes = 1000  # 1000 episodes per round
        # loss_list = []
        for episode in range(episodes):
            agent.learn()  # run one episode, collecting each step's transitions, then update the policy
            # loss_list.append(agent.loss)
            print('Episode: ', episode, '| reward: ', agent.scores[episode])
        avg_score = np.mean(agent.scores)  # average score over the 1000 episodes
        print('Round: ', i, '| Average score: ', int(avg_score))
        # print('loss values for this round:', loss_list)  # check whether the network is learning, i.e. whether the loss decreases
        round_all_score += avg_score
        agent.env.close()
    print('run', round_count, 'rounds, the average score is:', int(round_all_score / round_count))
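
A minimal save-and-evaluate sketch (an addition, not part of the original post): save_model() above is defined but never called by the training loop, so here is one way to save after training and reload the weights for an evaluation episode. The file name pg_net.pth comes from the code; everything else is illustrative and assumes the same pre-0.26 gym reset/step API used above.

agent.save_model()  # writes pg_net.pth for the agent of the last round

eval_env = gym.make('CartPole-v0')
eval_agent = Agent(eval_env, n_actions, state_n_dims)
eval_agent.policy_net.load_state_dict(torch.load('pg_net.pth'))

state = eval_env.reset()
done, total = False, 0
while not done:
    action = eval_agent.choose_action(state)  # still samples from the softmax probabilities
    state, reward, done, _ = eval_env.step(action)
    total += reward
print('evaluation score of the reloaded policy:', total)
eval_env.close()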

There may be some small issues; corrections are welcome.
