1. paddle.to_tensor
Creates a tensor from known data.
paddle.to_tensor(data, dtype=None)
- data can be a scalar, list, numpy.ndarray, etc.
- dtype (str, optional) - the data type of the created tensor; can be 'bool', 'float16', 'float32', 'float64', etc.
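A few minimal examples of these argument types (my own illustration; the values are arbitrary):
import numpy as np
import paddle

t1 = paddle.to_tensor(1.0)                                     # from a scalar
t2 = paddle.to_tensor([0.1, 0.3, 0.4, 0.2], dtype='float32')   # from a list
t3 = paddle.to_tensor(np.array([1, 0, 1]), dtype='int32')      # from a numpy.ndarray
print(t2)   # Tensor(shape=[4], dtype=float32, ...)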
2. This is a dynamic graph
Because it builds on paddle.nn.Layer: the official API documentation states that Layer is a dynamic-graph Layer implemented with object-oriented design (OOD).
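To make "dynamic graph" concrete, here is a minimal sketch (my own illustration, assuming Paddle 2.x, where dynamic mode is the default): operations execute eagerly, so intermediate tensors can be printed immediately, with no separate graph-compilation step.
import paddle
import paddle.nn as nn

fc = nn.Linear(4, 2)   # a Layer instance, built on paddle.nn.Layer
x = paddle.to_tensor([[0.1, 0.3, 0.4, 0.2]], dtype='float32')
y = fc(x)              # runs immediately in dynamic-graph (eager) mode
print(y)               # a concrete Tensor, inspectable right away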
3. The forward network
The parl folder must sit in the root directory of the project, otherwise an error is raised.
obs_dim is the dimension of the input data, and act_dim is the dimension of the output. Taking the CartPole environment as an example, obs_dim is 4 (an observation looks like [0.1, 0.3, 0.4, 0.2]), and act_dim is 2 (the action is either 0 or 1).
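You can confirm these dimensions yourself with a quick check (a small snippet I added; it assumes gym and the CartPole-v0 environment are installed, as in the full script below):
import gym

env = gym.make('CartPole-v0')
print(env.observation_space.shape[0])   # 4 -> obs_dim
print(env.action_space.n)               # 2 -> act_dim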
class CartpoleModel(parl.Model):
    def __init__(self, obs_dim, act_dim):
        super(CartpoleModel, self).__init__()
        self.fc1 = nn.Linear(obs_dim, 100)
        self.fc2 = nn.Linear(100, act_dim)
        ##########################
        # Initialize the Model:
        # the input dimension is obs_dim;
        # the hidden layer has 100 neurons;
        # the output dimension is act_dim.
        ##########################

    def forward(self, x):
        out = paddle.tanh(self.fc1(x))
        prob = F.softmax(self.fc2(out))
        return prob
        #########################
        # Define the forward network; x is the network input.
        # First self.fc1(x), a fully connected layer, is applied,
        # then activated with tanh to get out.
        # Next self.fc2(out), another fully connected layer,
        # is activated with softmax.
        # The final output is the probability of each of the two
        # actions, e.g. 0.11 and 0.89.
        #########################
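To see the output format concretely, here is a small sketch (my own check; it assumes the CartpoleModel class above is defined and the imports from the full script below are in place) that pushes one dummy observation through the model:
import numpy as np
import paddle

model = CartpoleModel(obs_dim=4, act_dim=2)   # class defined above
obs = paddle.to_tensor(np.array([0.1, 0.3, 0.4, 0.2]), dtype='float32')
prob = model(obs)
print(prob)   # e.g. [0.11, 0.89]; the softmax outputs sum to 1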
4. Agent: the interface between the algorithm and the environment
CartpoleAgent is the agent we define ourselves; it must inherit from parl.Agent.
- __init__(): when CartpoleAgent is instantiated, the initializer runs first, which means the algorithm must be passed in at instantiation time.
- sample(): used for random exploration; it ends with np.random.choice(len(prob), 1, p=prob)[0], which draws action 0 or 1 according to the probabilities (see the sketch after this list).
- predict(): used during model evaluation; it directly picks the optimal action.
- learn(): uses the algorithm to update the forward network.
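A toy sketch (my own illustration, not part of the original code) contrasting the two selection rules on a fixed probability vector:
import numpy as np

prob = np.array([0.11, 0.89])

# sample(): stochastic exploration; draws action 1 roughly 89% of the time
act_explore = np.random.choice(len(prob), 1, p=prob)[0]

# predict(): greedy evaluation; always picks the argmax
act_greedy = int(prob.argmax())

print(act_explore, act_greedy)   # e.g. "1 1"; the first value varies run to run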
class CartpoleAgent(parl.Agent):
    def __init__(self, algorithm):
        super(CartpoleAgent, self).__init__(algorithm)

    def sample(self, obs):
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        prob = prob.numpy()
        act = np.random.choice(len(prob), 1, p=prob)[0]
        return act

    def predict(self, obs):
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        act = prob.argmax().numpy()[0]
        return act

    def learn(self, obs, act, reward):
        ##### act arrives as a flat list of actions, one per step.
        ##### np.expand_dims turns it into shape [N, 1], and
        ##### paddle.to_tensor then converts it to a tensor.
        ##### Only in this form does the data meet the requirements
        ##### of self.alg.learn, which optimizes the network parameters.
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')
        loss = self.alg.learn(obs, act, reward)
        return loss.numpy()[0]
The several forms act takes: the raw act_list; after act = np.expand_dims(act, axis=-1); and after paddle.to_tensor(act, dtype='int32'):
[1 1 1 0 1 0 1 0 0 1 0 1 1]
[[1]
[1]
[1]
[0]
[1]
[0]
[1]
[0]
[0]
[1]
[0]
[1]
[1]]
Tensor(shape=[13, 1], dtype=int32, place=CPUPlace, stop_gradient=True,
[[1],
[1],
[1],
[0],
[1],
[0],
[1],
[0],
[0],
[1],
[0],
[1],
[1]])
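The three outputs above can be reproduced directly (a sketch of the conversions alone; the action values are copied from the run above):
import numpy as np
import paddle

act = [1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1]   # act_list, one action per step
act = np.expand_dims(act, axis=-1)              # ndarray of shape (13, 1)
act = paddle.to_tensor(act, dtype='int32')      # Tensor(shape=[13, 1], dtype=int32)
print(act.shape)                                # [13, 1]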
The several forms loss takes:
loss inside learn():
Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=False,
[14.64161110])
loss.numpy()
[14.641611]
loss.numpy()[0]
14.641611
Full CartPole implementation with commentary; the code comes from the official Paddle example:
import os   # used by the commented-out restore block in main()

import gym
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import parl
from parl.utils import logger


class CartpoleModel(parl.Model):
    def __init__(self, obs_dim, act_dim):
        super(CartpoleModel, self).__init__()
        self.fc1 = nn.Linear(obs_dim, 100)
        self.fc2 = nn.Linear(100, act_dim)

    def forward(self, x):
        out = paddle.tanh(self.fc1(x))
        prob = F.softmax(self.fc2(out))
        return prob


class CartpoleAgent(parl.Agent):
    def __init__(self, algorithm):
        super(CartpoleAgent, self).__init__(algorithm)

    def sample(self, obs):
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        prob = prob.numpy()
        act = np.random.choice(len(prob), 1, p=prob)[0]
        return act

    def predict(self, obs):
        obs = paddle.to_tensor(obs, dtype='float32')
        prob = self.alg.predict(obs)
        act = prob.argmax().numpy()[0]
        return act

    def learn(self, obs, act, reward):
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')
        loss = self.alg.learn(obs, act, reward)
        return loss.numpy()[0]
def run_train_episode(agent, env):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs_list.append(obs)
        action = agent.sample(obs)
        action_list.append(action)
        obs, reward, done, info = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list
# evaluate 5 episodes
def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):
    eval_reward = []
    for i in range(eval_episodes):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)
def calc_reward_to_go(reward_list, gamma=1.0):
    for i in range(len(reward_list) - 2, -1, -1):
        # G_i = r_i + γ·G_{i+1}
        reward_list[i] += gamma * reward_list[i + 1]   # G_t
    return np.array(reward_list)
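# Worked example for calc_reward_to_go (my own sanity check, not part of
# the original script): with reward_list = [1.0, 1.0, 1.0] and gamma = 1.0,
# the backward loop gives G_2 = 1, G_1 = 1 + G_2 = 2, G_0 = 1 + G_1 = 3,
# so the function returns array([3.0, 2.0, 1.0]).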
def main():
    env = gym.make('CartPole-v0')
    # env = env.unwrapped  # Cancel the minimum score limit
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build an agent
    model = CartpoleModel(obs_dim=obs_dim, act_dim=act_dim)
    alg = parl.algorithms.PolicyGradient(model, lr=1e-3)
    agent = CartpoleAgent(alg)

    # load model and evaluate
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')
    #     run_evaluate_episodes(agent, env, render=True)
    #     exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_train_episode(agent, env)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        # debug prints; the act_list output shown earlier comes from these
        print(batch_obs.shape)
        print(batch_action)
        print(batch_reward.shape)

        agent.learn(batch_obs, batch_action, batch_reward)

        if (i + 1) % 100 == 0:
            total_reward = run_evaluate_episodes(agent, env, render=False)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')


if __name__ == '__main__':
    main()