PaddlePaddle 2.0 Learning (2): A Detailed Walkthrough of the CartPole Beginner Tutorial

1. paddle.to_tensor

Creates a tensor from the given data:
paddle.to_tensor(data, dtype=None)

  1. data can be a scalar, list, or numpy.ndarray
  2. dtype (str, optional): the data type of the created tensor, e.g. 'bool', 'float16', 'float32', 'float64', etc.
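A minimal sketch of how paddle.to_tensor is typically called (the variable names here are made up for illustration):

import numpy as np
import paddle

# from a Python list, with an explicit dtype
obs = paddle.to_tensor([0.1, 0.3, 0.4, 0.2], dtype='float32')
print(obs.shape)           # [4]

# from a numpy.ndarray; the dtype is inferred when not given
arr = np.array([[1, 2], [3, 4]])
t = paddle.to_tensor(arr, dtype='int32')
print(t.shape, t.dtype)    # [2, 2] paddle.int32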

2. This is a dynamic graph

Because it uses paddle.nn.Layer: the official API documentation states that it is a dynamic-graph Layer implemented in an OOD (object-oriented design) style.
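In dynamic-graph (eager) mode a Layer runs like ordinary Python: calling it executes the operators immediately and the intermediate tensors can be inspected on the spot. A minimal sketch with a made-up TinyLayer:

import paddle
import paddle.nn as nn

class TinyLayer(nn.Layer):
    def __init__(self):
        super(TinyLayer, self).__init__()
        self.fc = nn.Linear(4, 2)

    def forward(self, x):
        y = self.fc(x)
        print(y)           # runs eagerly; the concrete values are already available here
        return y

out = TinyLayer()(paddle.randn([1, 4]))   # no separate graph-compilation step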

3. The forward network

The parl folder must sit in the root directory of the project, otherwise an error will be raised.
obs_dim is the dimension of the input data and act_dim is the dimension of the output. Taking the CartPole environment as an example, obs_dim is 4, i.e. an observation looks like [0.1, 0.3, 0.4, 0.2], and act_dim is 2, the two actions 0 and 1.
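For reference, both dimensions can be read straight off the gym environment, which is exactly what main() further down does:

import gym

env = gym.make('CartPole-v0')
obs_dim = env.observation_space.shape[0]   # 4
act_dim = env.action_space.n               # 2
print(obs_dim, act_dim)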

class CartpoleModel(parl.Model):

    def __init__(self, obs_dim, act_dim):
        super(CartpoleModel, self).__init__()
        self.fc1 = nn.Linear(obs_dim, 100)
        self.fc2 = nn.Linear(100, act_dim)
##########################
# Initialize the Model:
# the input dimension is obs_dim;
# the hidden layer has 100 neurons;
# the output dimension is act_dim.
##########################
    def forward(self, x):
        out = paddle.tanh(self.fc1(x))
        prob = F.softmax(self.fc2(out))
        return prob
#########################
# The forward pass; x is the input to the network:
# first self.fc1(x), a fully connected layer,
# then the tanh activation gives out,
# then self.fc2(out), another fully connected layer,
# then the softmax activation,
# and the final output is the probability of each action, e.g. 0.11 and 0.89.
#########################
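A quick sketch of a forward pass through this model for a single CartPole observation; the exact probabilities will differ because the weights are randomly initialized:

model = CartpoleModel(obs_dim=4, act_dim=2)
obs = paddle.to_tensor([0.1, 0.3, 0.4, 0.2], dtype='float32')
prob = model(obs)          # calls forward(); the two entries sum to 1
print(prob)                # e.g. something like [0.11, 0.89]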

4. Agent: the interface between the algorithm and the environment

CartpoleAgent is the agent we define ourselves, and it must inherit from parl.Agent.

  1. __init__(): when CartpoleAgent is instantiated, the init function runs first, which means the algorithm has to be passed in at instantiation time.
  2. sample(): used for stochastic exploration; at the end, np.random.choice(len(prob), 1, p=prob)[0] picks action 0 or 1 according to the probabilities.
  3. predict(): used when evaluating the model; it directly picks the greedy (highest-probability) action. (A short sketch contrasting sample() and predict() follows the class below.)
  4. learn(): uses the algorithm to update the forward network.
class CartpoleAgent(parl.Agent):

    def __init__(self, algorithm):
        super(CartpoleAgent, self).__init__(algorithm)

    def sample(self, obs):
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        prob = prob.numpy()
        act = np.random.choice(len(prob),1,p=prob)[0]
        return act

    def predict(self,obs):
        obs = paddle.to_tensor(obs,dtype='float32')
        prob = self.alg.predict(obs)
        act = prob.argmax().numpy()[0]
        return act

    def learn(self, obs, act, reward):
    ##### act arrives as a flat list of length ??? (one action per step);
    ##### np.expand_dims turns it into shape [???, 1],
    ##### and paddle.to_tensor then converts it into a tensor,
    ##### which is the format self.alg.learn expects, so the network parameters can be optimized.
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')

        loss = self.alg.learn(obs, act, reward)

        return loss.numpy()[0]
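The real difference between sample() and predict() is only the final selection step: drawing from the probability vector versus taking its argmax. A minimal sketch, assuming prob is the example probability vector [0.11, 0.89]:

import numpy as np

prob = np.array([0.11, 0.89])

# sample(): stochastic exploration, action 1 is picked roughly 89% of the time
act_explore = np.random.choice(len(prob), 1, p=prob)[0]

# predict(): greedy evaluation, always the most likely action
act_greedy = int(np.argmax(prob))

print(act_explore, act_greedy)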

The several forms act takes: the raw action list; act = np.expand_dims(act, axis=-1);
paddle.to_tensor(act, dtype='int32')

[1 1 1 0 1 0 1 0 0 1 0 1 1]
[[1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]]
Tensor(shape=[13, 1], dtype=int32, place=CPUPlace, stop_gradient=True,
       [[1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [1]])
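A small self-contained snippet that reproduces the three forms above, using the same example action sequence:

import numpy as np
import paddle

act = np.array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1])
print(act)                                   # flat array, shape (13,)

act = np.expand_dims(act, axis=-1)
print(act)                                   # column vector, shape (13, 1)

act = paddle.to_tensor(act, dtype='int32')
print(act)                                   # Tensor(shape=[13, 1], dtype=int32, ...)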

The several forms loss takes:

the loss inside learn()
Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=False,
       [14.64161110])
loss.numpy()
[14.641611]
loss.numpy()[0]
14.641611
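In other words, .numpy() turns the single-element loss tensor into a NumPy array and [0] pulls out a plain scalar that is convenient for logging. A tiny sketch, using a stand-in tensor in place of the one self.alg.learn returns:

import paddle

loss = paddle.to_tensor([14.64161110], dtype='float32')   # stand-in for the tensor returned by self.alg.learn
print(loss)              # Tensor(shape=[1], dtype=float32, ...)
print(loss.numpy())      # [14.641611]
print(loss.numpy()[0])   # 14.641611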

Walkthrough of the full CartPole implementation; the code comes from the official Paddle examples:

import gym
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import parl
from parl.utils import logger

class CartpoleModel(parl.Model):

    def __init__(self, obs_dim, act_dim):
        super(CartpoleModel, self).__init__()
        self.fc1 = nn.Linear(obs_dim, 100)
        self.fc2 = nn.Linear(100, act_dim)

    def forward(self, x):
        out = paddle.tanh(self.fc1(x))
        prob = F.softmax(self.fc2(out))
        return prob

class CartpoleAgent(parl.Agent):

    def __init__(self, algorithm):
        super(CartpoleAgent, self).__init__(algorithm)

    def sample(self, obs):
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        prob = prob.numpy()
        act = np.random.choice(len(prob),1,p=prob)[0]
        return act

    def predict(self,obs):
        obs = paddle.to_tensor(obs,dtype='float32')
        prob = self.alg.predict(obs)
        act = prob.argmax().numpy()[0]
        return act

    def learn(self, obs, act, reward):
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')

        loss = self.alg.learn(obs, act, reward)

        return loss.numpy()[0]


def run_train_episode(agent,env):
    obs_list, action_list, reward_list = [],[],[]
    obs = env.reset()
    while True:
        obs_list.append(obs)
        action = agent.sample(obs)
        action_list.append(action)

        obs, reward, done, info = env.step(action)
        reward_list.append(reward)

        if done:
            break
    return obs_list, action_list, reward_list
# evaluate 5 episodes

def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):
    eval_reward = []
    for i in range(eval_episodes):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


def calc_reward_to_go(reward_list, gamma=1.0):
    for i in range(len(reward_list) - 2, -1, -1):
        # G_i = r_i + γ·G_i+1
        reward_list[i] += gamma * reward_list[i + 1]  # Gt
    return np.array(reward_list)


def main():
    env = gym.make('CartPole-v0')
    # env = env.unwrapped # Cancel the minimum score limit
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build an agent
    model = CartpoleModel(obs_dim=obs_dim, act_dim=act_dim)
    alg = parl.algorithms.PolicyGradient(model, lr=1e-3)
    agent = CartpoleAgent(alg)

    # load model and evaluate
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')
    #     run_evaluate_episodes(agent, env, render=True)
    #     exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_train_episode(agent, env)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        print(batch_obs.shape)
        print(batch_action)
        print(batch_reward.shape)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = run_evaluate_episodes(agent, env, render=False)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')


if __name__ == '__main__':
    main()
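To make calc_reward_to_go concrete, here is a tiny worked example with gamma = 1.0, the default used above; for a 4-step episode with rewards [1, 1, 1, 1] the return-to-go at each step is simply the sum of the remaining rewards:

rewards = [1.0, 1.0, 1.0, 1.0]
# working backwards with G_i = r_i + gamma * G_{i+1}:
# i = 2: 1 + 1 = 2
# i = 1: 1 + 2 = 3
# i = 0: 1 + 3 = 4
print(calc_reward_to_go(rewards, gamma=1.0))   # [4. 3. 2. 1.]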
