Environment:
pip install gym
pip install paddlepaddle==1.6.3
pip install parl==1.3.1
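Before going further, it is worth confirming the versions actually installed (a minimal sanity check, assuming each package exposes __version__, as these releases do):

import gym
import paddle
import parl

# Expect paddlepaddle 1.6.3 and parl 1.3.1 to match the pins above
print(gym.__version__, paddle.__version__, parl.__version__)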
1. Neural Network
Code:
model.py
import parl
from parl import layers

# Policy model: maps an observation to a probability distribution over actions
class Model(parl.Model):
    def __init__(self, act_dim):
        # act_dim: size of the (discrete) action space
        hid1_size = act_dim * 10
        # Fully connected layers
        self.fc1 = layers.fc(size=hid1_size, act='tanh')
        self.fc2 = layers.fc(size=act_dim, act='softmax')  # action probabilities

    # Forward pass
    def forward(self, obs):
        out = self.fc1(obs)
        out = self.fc2(out)
        # Return the probability of each action
        return out
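For intuition, the two layers above compute a tanh hidden layer followed by a softmax output. A NumPy sketch of the same forward pass (illustrative only, not the PARL API; W1, b1, W2, b2 stand in for the learned fc parameters):

import numpy as np

def forward_sketch(obs, W1, b1, W2, b2):
    # Hidden layer: size act_dim * 10, tanh activation
    h = np.tanh(obs @ W1 + b1)
    # Output layer: softmax turns logits into action probabilities
    z = h @ W2 + b2
    e = np.exp(z - z.max())  # subtract the max for numerical stability
    return e / e.sum()       # probabilities sum to 1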
2. Policy Algorithm
Code:
algorithm.py
import paddle.fluid as fluid
import parl
from parl import layers

# Policy-gradient algorithm (REINFORCE)
class PolicyGradient(parl.Algorithm):
    def __init__(self, model, lr=None):
        self.model = model  # policy model
        assert isinstance(lr, float)
        self.lr = lr  # learning rate

    # Predict action probabilities
    def predict(self, obs):
        return self.model(obs)

    # Update the model from one episode of data
    def learn(self, obs, action, reward):
        # Predicted action probabilities
        act_prob = self.model(obs)
        # Cross entropy between the predicted probabilities and the taken action (one-hot)
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) * layers.one_hot(action, act_prob.shape[1]),
            dim=1)
        # Loss: negative log-likelihood weighted by the return
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)
        # Optimizer
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost
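The learn step is the REINFORCE update: the one-hot mask picks out -log(pi(a|s)) for the action actually taken, and weighting it by the return makes the optimizer raise the probability of actions that led to high reward. The same computation in NumPy (an illustrative sketch; act_prob, action and reward correspond to the tensors above):

import numpy as np

def pg_loss_sketch(act_prob, action, reward):
    # act_prob: (N, act_dim) softmax outputs; action: (N,) ints; reward: (N,) returns
    one_hot = np.eye(act_prob.shape[1])[action]             # same role as layers.one_hot
    log_prob = np.sum(-np.log(act_prob) * one_hot, axis=1)  # -log pi(a|s)
    return np.mean(log_prob * reward)                       # the cost being minimized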
3. Agent
Code:
agent.py
import numpy as np
import paddle.fluid as fluid
import parl
from parl import layers

# Agent
class Agent(parl.Agent):
    def __init__(self, algorithm, obs_dim, act_dim):
        self.obs_dim = obs_dim  # observation dimension
        self.act_dim = act_dim  # action dimension
        super(Agent, self).__init__(algorithm)

    # Build the fluid programs (computation graphs)
    def build_program(self):
        self.pred_program = fluid.Program()   # prediction graph
        self.learn_program = fluid.Program()  # learning graph

        # Build the prediction graph
        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            # Observation in, action probabilities out
            self.act_prob = self.alg.predict(obs)

        # Build the learning graph
        with fluid.program_guard(self.learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(name='act', shape=[1], dtype='int64')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            # Learn from observations, actions and returns
            self.cost = self.alg.learn(obs, act, reward)

    # Sample an action (stochastic policy)
    def sample(self, obs):
        obs = np.expand_dims(obs, axis=0)
        # Run the prediction graph
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])[0]
        act_prob = np.squeeze(act_prob, axis=0)
        # Draw an action according to its probability
        act = np.random.choice(range(self.act_dim), p=act_prob)
        return act

    # Predict an action (deterministic policy)
    def predict(self, obs):
        obs = np.expand_dims(obs, axis=0)
        # Run the prediction graph
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])[0]
        # Take the most probable action
        act = np.argmax(act_prob)
        return act

    # Update the model parameters
    def learn(self, obs, act, reward):
        act = np.expand_dims(act, axis=-1)
        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int64'),
            'reward': reward.astype('float32')
        }
        # Run the learning graph
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]
        # Return the loss
        return cost
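The only difference between sample and predict is how an action is drawn from the probability vector: sampling explores during training, argmax exploits during evaluation. For example, with probabilities [0.7, 0.3]:

import numpy as np

act_prob = np.array([0.7, 0.3])
# sample(): stochastic, picks action 0 about 70% of the time
print(np.random.choice(range(2), p=act_prob))
# predict(): deterministic, always picks action 0
print(np.argmax(act_prob))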
4. Training
Code:
train.py
import gym
import numpy as np

from agent import Agent
from algorithm import PolicyGradient
from model import Model

# Run one episode and collect the trajectory
def run_episode(env, agent):
    # Record all observations, actions and rewards
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()  # reset the environment
    while True:
        obs_list.append(obs)        # save the observation
        action = agent.sample(obs)  # sample an action
        action_list.append(action)  # save the action
        # Take one step
        obs, reward, done, info = env.step(action)
        reward_list.append(reward)  # save the reward
        if done:  # episode finished
            break
    return obs_list, action_list, reward_list

# Evaluate the agent
def evaluate(env, agent, render=False):
    # Record the reward of each episode
    eval_reward = []
    for i in range(5):
        obs = env.reset()   # reset the environment
        episode_reward = 0  # total reward of one episode
        while True:
            action = agent.predict(obs)  # predict an action
            # Take one step
            obs, reward, done, _ = env.step(action)
            # Accumulate the reward
            episode_reward += reward
            if render:  # draw the environment
                env.render()
            if done:  # finished
                break
        # Save the total reward of this episode
        eval_reward.append(episode_reward)
    # Return the mean reward over 5 episodes
    return np.mean(eval_reward)

# Compute the discounted reward-to-go for each step
def calc_reward_to_go(reward_list, gamma=1.0):
    for i in range(len(reward_list) - 2, -1, -1):
        reward_list[i] += gamma * reward_list[i + 1]
    return np.array(reward_list)

def main():
    # Learning rate
    LEARNING_RATE = 1e-3
    # Initialize the environment
    env = gym.make('CartPole-v0')
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    print('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
    # Initialize the model
    model = Model(act_dim=act_dim)
    # Create the policy-gradient algorithm
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    # Create the agent
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)
    # Train for 1000 episodes
    for i in range(1000):
        # Run one episode to collect data
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            # Print the episode reward sum
            print('Episode {}, Reward Sum {}'.format(i, sum(reward_list)))
        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        # Update the parameters
        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            # Evaluate the mean reward
            total_reward = evaluate(env, agent, render=True)
            print('Test reward:{}'.format(total_reward))
    # Save the agent
    agent.save('./model.ckpt')
    env.close()

if __name__ == '__main__':
    main()
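calc_reward_to_go accumulates rewards from the back of the episode, so each step is credited with everything that followed it: with gamma=1.0, [1, 1, 1] becomes [3, 2, 1]. A quick check (importing the function defined above; the __main__ guard makes this safe):

from train import calc_reward_to_go

print(calc_reward_to_go([1.0, 1.0, 1.0]))             # [3. 2. 1.]
print(calc_reward_to_go([1.0, 1.0, 1.0], gamma=0.9))  # [2.71 1.9 1.]

A common refinement not used in this code is to normalize the returns before learning, e.g. batch_reward = (batch_reward - batch_reward.mean()) / (batch_reward.std() + 1e-8), which reduces the variance of the gradient.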
5. Prediction
Code:
predict.py
import os

import gym
import numpy as np

from agent import Agent
from algorithm import PolicyGradient
from model import Model

# Evaluate the agent
def evaluate(env, agent, render=False):
    # Record the reward of each episode
    eval_reward = []
    for i in range(5):
        obs = env.reset()   # reset the environment
        episode_reward = 0  # total reward of one episode
        while True:
            action = agent.predict(obs)  # predict an action
            # Take one step
            obs, reward, done, _ = env.step(action)
            # Accumulate the reward
            episode_reward += reward
            if render:  # draw the environment
                env.render()
            if done:  # finished
                break
        # Save the total reward of this episode
        eval_reward.append(episode_reward)
    # Return the mean reward over 5 episodes
    return np.mean(eval_reward)
def main():
    # Learning rate
    LEARNING_RATE = 1e-3
    # Initialize the environment
    env = gym.make('CartPole-v0')
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    print('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
    # Initialize the model
    model = Model(act_dim=act_dim)
    # Create the policy-gradient algorithm
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    # Create the agent
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)
    # Load the trained parameters
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')
    # Evaluate the agent
    for i in range(1):
        total_reward = evaluate(env, agent, render=True)
        print('Test reward:{}'.format(total_reward))
    env.close()

if __name__ == '__main__':
    main()
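To reproduce the results: run python train.py to train for 1000 episodes and save ./model.ckpt, then run python predict.py to restore the checkpoint and watch the rendered evaluation episodes. A trained agent should approach the CartPole-v0 maximum episode reward of 200.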