Notes on Classic Reinforcement Learning Algorithms (8): Solving POMDPs with an LSTM-Augmented A2C

I recently needed to build an agent with an LSTM and found a very concise, readable example:
https://github.com/HaiyinPiao/pytorch-a2clstm-DRQN

The environment is CartPole-v1. The original state is a 4-dimensional vector; here the second dimension, the cart velocity, is deleted, keeping the cart position, pole angle, and pole angular velocity. This turns the task from an MDP into a POMDP (Partially Observable Markov Decision Process).
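This masking could also be folded into the environment itself, instead of calling np.delete after every reset and step as the code below does. A minimal sketch, assuming gym's ObservationWrapper interface (the MaskVelocityWrapper name is only illustrative; the code below keeps the explicit np.delete calls):

import gym
import numpy as np

class MaskVelocityWrapper(gym.ObservationWrapper):
    # Drop the cart-velocity component (index 1) of the CartPole observation,
    # turning the MDP into a POMDP.
    def __init__(self, env):
        super().__init__(env)
        low = np.delete(env.observation_space.low, 1)
        high = np.delete(env.observation_space.high, 1)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)

    def observation(self, obs):
        return np.delete(obs, 1)

# usage: env = MaskVelocityWrapper(gym.make("CartPole-v1"))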

The code is as follows:

Import the necessary packages

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
import math
import random
import os
import gym

Parameter settings

STATE_DIM = 4 - 1    # drop the cart-velocity dimension to make the task a POMDP
ACTION_DIM = 2       # size of the action space
NUM_EPISODE = 5000   # number of training episodes
EPISODE_LEN = 1000   # maximum episode length
A_HIDDEN = 40        # number of hidden units in the actor's LSTM
C_HIDDEN = 40        # number of hidden units in the critic's LSTM

Actor and Critic networks

# The actor uses an LSTM + MLP so that its recurrent hidden state can make up for the missing (unobserved) part of the state
class ActorNetwork(nn.Module):

    def __init__(self,in_size,hidden_size,out_size):
        super(ActorNetwork, self).__init__()
        self.lstm = nn.LSTM(in_size, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size,out_size)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        x = F.log_softmax(x,2)  # log(softmax(x))
        return x, hidden

# The critic has the same LSTM + linear structure and outputs a scalar state value
class ValueNetwork(nn.Module):

    def __init__(self,in_size,hidden_size,out_size):
        super(ValueNetwork, self).__init__()
        self.lstm = nn.LSTM(in_size, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size,out_size)

    def forward(self,x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = self.fc(x)
        return x, hidden
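As a quick sanity check of the tensor shapes (a sketch, not part of the training script, reusing the constants defined above): with batch_first=True the LSTM expects input of shape [batch, seq_len, STATE_DIM], while the hidden and cell states are each [num_layers, batch, hidden_size].

# shape check for the two networks above (batch of 1, sequence length 5)
actor = ActorNetwork(in_size=STATE_DIM, hidden_size=A_HIDDEN, out_size=ACTION_DIM)
critic = ValueNetwork(in_size=STATE_DIM, hidden_size=C_HIDDEN, out_size=1)

x = torch.zeros(1, 5, STATE_DIM)                                       # [batch, seq_len, features]
a_hidden = (torch.zeros(1, 1, A_HIDDEN), torch.zeros(1, 1, A_HIDDEN))  # (h0, c0)
c_hidden = (torch.zeros(1, 1, C_HIDDEN), torch.zeros(1, 1, C_HIDDEN))

log_probs, _ = actor(x, a_hidden)   # -> torch.Size([1, 5, 2])
values, _ = critic(x, c_hidden)     # -> torch.Size([1, 5, 1])
print(log_probs.shape, values.shape)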

Roll out a single episode and record the trajectory

def roll_out(actor_network,env,episode_len,value_network,init_state):
    '''
    Roll out at most episode_len (1000) frames.
    Returns:
    the state sequence, excluding the terminal state
    the action sequence, one-hot encoded
    the reward sequence, excluding the terminal reward
    final_r: the critic's value estimate of the last state (0 if the episode terminated)
    state: the state to start the next rollout from (the freshly reset state if the episode ended)
    '''
    states = []
    actions = []
    rewards = []
    is_done = False
    final_r = 0
    state = init_state  # the initial state for this rollout
    # initialize the actor's and critic's LSTM hidden/cell states, shape [num_layers, batch, hidden]
    a_hx = torch.zeros(1, 1, A_HIDDEN)
    a_cx = torch.zeros(1, 1, A_HIDDEN)
    c_hx = torch.zeros(1, 1, C_HIDDEN)
    c_cx = torch.zeros(1, 1, C_HIDDEN)

    for j in range(episode_len):
        states.append(state)
        log_softmax_action, (a_hx,a_cx) = actor_network(Variable(torch.Tensor([state]).unsqueeze(0)), (a_hx,a_cx))
        # this could also be done with torch.distributions.Categorical
        # (see the sketch after this function)
        # from torch.distributions import Categorical
        softmax_action = torch.exp(log_softmax_action) # exponentiate the log-softmax to recover the action probabilities
        action = np.random.choice(ACTION_DIM,p=softmax_action.cpu().data.numpy()[0][0])
        
        # one-hot encode the action
        one_hot_action = [int(k == action) for k in range(ACTION_DIM)]
        
        next_state,reward,done,_ = env.step(action)
        next_state = np.delete(next_state, 1)
        #fix_reward = -10 if done else 1
        
        actions.append(one_hot_action)
        rewards.append(reward)
        final_state = next_state # final_state just tracks the latest next_state (same as state below)
        state = next_state
        if done:
            is_done = True
            state = env.reset()
            state = np.delete(state,1)
            # reset the LSTM hidden states for the next episode
            a_hx = torch.zeros(1, 1, A_HIDDEN)
            a_cx = torch.zeros(1, 1, A_HIDDEN)
            c_hx = torch.zeros(1, 1, C_HIDDEN)
            c_cx = torch.zeros(1, 1, C_HIDDEN)

            # print the episode length (for CartPole this equals the episode's total score)
            print(j+1)
            break
    if not is_done: # if the episode is still running after episode_len frames, bootstrap with the critic's estimate of the last state's value
        c_out, (c_hx,c_cx) = value_network(Variable(torch.Tensor([final_state]).unsqueeze(0)), (c_hx,c_cx))
        final_r = c_out.cpu().data.item() # otherwise final_r stays 0: the terminal state (cart/pole out of control) is worth nothing
    return states,actions,rewards,final_r,state
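As the comment inside roll_out notes, the sampling step can also use torch.distributions.Categorical instead of exponentiating the log-softmax and calling np.random.choice. A minimal, self-contained sketch (the random tensor below merely stands in for the actor's output):

import torch
from torch.distributions import Categorical

log_softmax_action = torch.log_softmax(torch.randn(1, 1, 2), dim=2)  # stand-in for the actor output, shape [1, 1, ACTION_DIM]

dist = Categorical(logits=log_softmax_action.squeeze(0).squeeze(0))  # log-probabilities are valid logits
action = dist.sample().item()                                        # integer action index for env.step
log_prob = dist.log_prob(torch.tensor(action))                       # could be stored and used directly in the policy-gradient loss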

Function for computing the discounted cumulative reward

def discount_reward(r, gamma,final_r):
    '''
    r:          list of per-step rewards
    final_r:    scalar, bootstrap value of the state following the last reward
    '''
    discounted_r = np.zeros_like(r)
    running_add = final_r
    for t in reversed(range(0, len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
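A quick numeric check (not part of the training script): with gamma = 0.9, rewards [1, 1, 1] and final_r = 0, the returns are [1 + 0.9*1.9, 1 + 0.9*1, 1] = [2.71, 1.9, 1.0]. Note that the rewards should be floats, since np.zeros_like on an integer list would silently truncate the results.

print(discount_reward([1.0, 1.0, 1.0], 0.9, 0.0))   # [2.71 1.9  1.  ]
print(discount_reward([1.0, 1.0], 0.99, 5.0))       # bootstrapped from final_r = 5.0: [6.8905 5.95  ]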

Main function for training and testing

def main():
    # initialize the environment
    env = gym.make("CartPole-v1")
    init_state = env.reset()
    init_state = np.delete(init_state,1) # drop the cart-velocity dimension
    

    # initialize the value (critic) network
    value_network = ValueNetwork(in_size=STATE_DIM, hidden_size=C_HIDDEN, out_size=1)
    value_network_optim = torch.optim.Adam(value_network.parameters(),lr=0.005)

    # initialize the policy (actor) network
    actor_network = ActorNetwork(in_size=STATE_DIM, hidden_size=A_HIDDEN, out_size=ACTION_DIM)
    actor_network_optim = torch.optim.Adam(actor_network.parameters(),lr = 0.001)

    steps = []
    task_episodes = []
    test_results = []

    for episode in range(NUM_EPISODE):
        # complete one rollout
        states,actions,rewards,final_r,current_state = roll_out(actor_network,env,EPISODE_LEN,value_network,init_state)
        # states: a list of length epi_len, each entry a 3-dimensional observation
        
        # the state from which the next rollout will start
        init_state = current_state
        actions_var = Variable(torch.Tensor(actions).view(-1,ACTION_DIM)).unsqueeze(0)
        states_var = Variable(torch.Tensor(states).view(-1,STATE_DIM)).unsqueeze(0)

        # train the actor network
        a_hx = torch.zeros(1, 1, A_HIDDEN)
        a_cx = torch.zeros(1, 1, A_HIDDEN)
        c_hx = torch.zeros(1, 1, C_HIDDEN)
        c_cx = torch.zeros(1, 1, C_HIDDEN)
        
        actor_network_optim.zero_grad()
        # print(states_var.unsqueeze(0).size())
        log_softmax_actions, (a_hx,a_cx) = actor_network(states_var, (a_hx,a_cx))
        vs, (c_hx,c_cx) = value_network(states_var, (c_hx,c_cx)) # state-value estimates
        vs = vs.detach()    # the critic must not receive gradients from the actor loss
        
        # compute Q(s,a) (the discounted return) and the advantage
        qs = Variable(torch.Tensor(discount_reward(rewards,0.99,final_r)))
        qs = qs.view(1, -1, 1)
        advantages = qs - vs
        # print('adv,',advantages.shape)
        # multiplying log_softmax_actions by the one-hot actions and summing over the action
        # dimension picks out the log-probability of the action actually taken (see the gather check after main)
        actor_network_loss = - torch.mean(torch.sum(log_softmax_actions*actions_var, 2, keepdim=True) * advantages)
        actor_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(actor_network.parameters(), 0.5)
        actor_network_optim.step()

        # train the critic network
        value_network_optim.zero_grad()
        target_values = qs
        c_hx = torch.zeros(1, 1, C_HIDDEN)
        c_cx = torch.zeros(1, 1, C_HIDDEN)
        values, (c_hx,c_cx) = value_network(states_var, (c_hx,c_cx))

        criterion = nn.MSELoss()
        value_network_loss = criterion(values,target_values)
        value_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(value_network.parameters(), 0.5)
        value_network_optim.step()

        # Testing
        if (episode + 1) % 50 == 0:
                result = 0
                test_task = gym.make("CartPole-v1")
                for test_epi in range(10):       # test for 10 episodes
                    state = test_task.reset()
                    state = np.delete(state,1)
                    
                    a_hx = torch.zeros(1, 1, A_HIDDEN)
                    a_cx = torch.zeros(1, 1, A_HIDDEN)
                    c_hx = torch.zeros(1, 1, C_HIDDEN)
                    c_cx = torch.zeros(1, 1, C_HIDDEN)
                    
                    for test_step in range(500): # each test episode runs at most 500 frames
                        
                        log_softmax_actions, (a_hx,a_cx) = actor_network(Variable(torch.Tensor([state]).view(1,1,STATE_DIM)), (a_hx,a_cx))
                        softmax_action = torch.exp(log_softmax_actions)
                        
                        #print(softmax_action.data)
                        action = np.argmax(softmax_action.data.numpy()[0])
                        next_state,reward,done,_ = test_task.step(action)
                        next_state = np.delete(next_state,1)
                        
                        result += reward
                        state = next_state
                        if done:
                            break
                print("episode:",episode+1,"test result:",result/10.0)
                steps.append(episode+1)
                test_results.append(result/10)
    plt.plot(steps,test_results)
    plt.savefig('training_score.png')
if __name__ == '__main__':
    main()
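A side note on the one-hot trick in the actor loss: multiplying the log-softmax output by the one-hot action matrix and summing over the action dimension selects log pi(a_t | s_t) for the action actually taken, which is equivalent to torch.gather. A small standalone check (shapes are illustrative only):

import torch

log_probs = torch.log_softmax(torch.randn(1, 4, 2), dim=2)        # [batch, time, actions]
one_hot = torch.eye(2)[torch.tensor([0, 1, 1, 0])].unsqueeze(0)   # one-hot actions, [1, 4, 2]

picked = (log_probs * one_hot).sum(dim=2)                         # [1, 4]
gathered = log_probs.gather(2, torch.tensor([[0, 1, 1, 0]]).unsqueeze(2)).squeeze(2)
print(torch.allclose(picked, gathered))                           # True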

The experimental results are as follows:
[Figure: average test score (over 10 test episodes) versus training episode, saved as training_score.png]
