Value-based vs. Policy-based
In reinforcement learning there are two broad families of methods: value-based and policy-based.
- Value-based: typical examples are Q-learning and SARSA. They optimize the Q function towards the optimum and then derive the policy from that Q function.
- Policy-based: the typical example is Policy Gradient, which optimizes the policy function directly.
The difference is clear: one first estimates values and then decides by applying a fixed rule to those values, while the other outputs the decision directly in one step, rather than computing each state's value first and then deciding from it.
Stochastic policies
With a deterministic policy, the neural network outputs Q values computed from the state, and we then act on those Q values with a fixed rule (for example, greedily taking the best one).
With a stochastic policy, we feed in the state and the network directly outputs the probability of taking each action; it learns to make the decision in one step, and no Q values are produced.
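A minimal sketch, with made-up numbers, of the two ways of acting (argmax over Q values versus sampling from output probabilities):

import numpy as np

q_values = np.array([1.2, 0.3, -0.5])             # value-based: the network outputs Q(s, a)
greedy_action = int(np.argmax(q_values))          # deterministic: always take the largest Q

pi = np.array([0.7, 0.2, 0.1])                    # policy-based: the network outputs action probabilities
sampled_action = np.random.choice(len(pi), p=pi)  # stochastic: sample an action directly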
Having done this, a question arises: how do we optimize the network, and how do we judge whether it is good?
A policy-based model cannot backpropagate after a single step, because its outcome only emerges after many steps (a whole episode), so its optimization differs from that of an ordinary neural network.
Expected return of a trajectory
Since our ultimate goal is to obtain the largest expected return from our decisions, we combine the final return with the probability of the decision trajectory to form the expected return and use it as the score of a trajectory; based on how large this return is, we can then optimize the network.
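As a sketch in standard notation (τ denotes a trajectory, p_θ(τ) its probability under the current policy, R(τ) its total return), the objective is:

$$J(\theta) = \mathbb{E}_{\tau \sim p_\theta(\tau)}\big[R(\tau)\big] = \sum_{\tau} p_\theta(\tau)\,R(\tau)$$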
This is where the policy gradient comes in: we use it as the loss to optimize the neural network.
There are two ways to sample for the policy gradient:
- Monte Carlo: update the parameters once per episode, after it ends (REINFORCE).
- Temporal difference: update the parameters after every step, i.e., at a higher frequency (Actor-Critic).
REINFORCE
Algorithm core
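Written as a sketch of the update the code below implements: with G_t the (discounted, here also normalized) return from step t, REINFORCE performs gradient ascent on the expected return,

$$\nabla_\theta J(\theta) \approx \frac{1}{T}\sum_{t=0}^{T-1} G_t\,\nabla_\theta \log \pi_\theta(a_t \mid s_t),$$

which is realized by minimizing the loss

$$\mathrm{Loss} = \frac{1}{T}\sum_{t=0}^{T-1}\big(-\log \pi_\theta(a_t \mid s_t)\big)\,G_t .$$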
From algorithm to code
Algorithm flow
Code walkthrough
Following the algorithm flow above, the implementation uses PARL's three-layer structure: Model (the policy network), Algorithm (the policy gradient update), and Agent (the interface to the environment).
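The snippets that follow assume PARL 1.x with the PaddlePaddle fluid backend; roughly these imports and this hyperparameter are needed (LEARNING_RATE is a value assumed here, not given in the original):

import os
import gym
import numpy as np
import parl
import paddle.fluid as fluid
from parl import layers            # PARL 1.x wrappers around fluid layers
from parl.utils import logger

LEARNING_RATE = 1e-3               # assumed value; tune as needed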
Model
class Model(parl.Model):
    def __init__(self, act_dim):
        self.fc1 = layers.fc(size=256, act='tanh')
        self.fc2 = layers.fc(size=act_dim, act='softmax')  # outputs the probability of each action

    def forward(self, obs):  # can be called directly: model = Model(5); model(obs)
        out = self.fc1(obs)
        out = self.fc2(out)
        return out
Algorithm
class PolicyGradient(parl.Algorithm):
    def __init__(self, model, lr=None):
        """ Policy Gradient algorithm

        Args:
            model (parl.Model): forward network of the policy.
            lr (float): learning rate.
        """
        self.model = model
        assert isinstance(lr, float)
        self.lr = lr

    def predict(self, obs):
        """ Use the policy model to predict the action probabilities. """
        return self.model(obs)

    def learn(self, obs, action, reward):
        """ Update the policy model with the policy gradient algorithm. """
        act_prob = self.model(obs)  # action probabilities from the network
        # log_prob = layers.cross_entropy(act_prob, action)  # cross entropy gives the same -log pi(a|s)
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) * layers.one_hot(
                action, act_prob.shape[1]),
            dim=1)  # -log pi(a_t | s_t) of the action actually taken
        cost = log_prob * reward  # weight each step by its return G_t
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost
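To see what learn() computes numerically, here is a plain-NumPy equivalent of the loss (a hypothetical helper for illustration only, not part of PARL):

import numpy as np

def pg_loss(act_prob, action, reward):
    """act_prob: (N, act_dim) action probabilities; action: (N,) taken actions;
    reward: (N,) per-step returns G_t. Returns the scalar REINFORCE loss."""
    n = act_prob.shape[0]
    log_prob = -np.log(act_prob[np.arange(n), action])  # -log pi(a_t|s_t), same as the one_hot/reduce_sum above
    return np.mean(log_prob * reward)

# tiny example with made-up numbers
probs = np.array([[0.7, 0.3], [0.2, 0.8]])
print(pg_loss(probs, np.array([0, 1]), np.array([1.0, -1.0])))  # ≈ (0.357 - 0.223) / 2 ≈ 0.067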
Agent
class Agent(parl.Agent):
    def __init__(self, algorithm, obs_dim, act_dim):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

    def build_program(self):
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):  # build the graph for action prediction; declare its inputs/outputs
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.act_prob = self.alg.predict(obs)

        with fluid.program_guard(
                self.learn_program):  # build the graph for updating the policy network; declare its inputs/outputs
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(name='act', shape=[1], dtype='int64')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            self.cost = self.alg.learn(obs, act, reward)

    def sample(self, obs):
        obs = np.expand_dims(obs, axis=0)  # add a batch dimension
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])[0]
        act_prob = np.squeeze(act_prob, axis=0)  # remove the batch dimension
        act = np.random.choice(range(self.act_dim), p=act_prob)  # sample an action from the probabilities
        return act

    def predict(self, obs):
        obs = np.expand_dims(obs, axis=0)
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32')},
            fetch_list=[self.act_prob])[0]
        act_prob = np.squeeze(act_prob, axis=0)
        act = np.argmax(act_prob)  # pick the action with the highest probability
        return act

    def learn(self, obs, act, reward):
        act = np.expand_dims(act, axis=-1)
        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int64'),
            'reward': reward.astype('float32')
        }
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]
        return cost
Training and testing
def run_episode(env, agent):
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
        obs_list.append(obs)
        action = agent.sample(obs)  # sample an action from the policy
        action_list.append(action)
        obs, reward, done, info = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list
# evaluate the agent: run 5 episodes and average the total reward
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
            action = agent.predict(obs)  # take the greedy (most probable) action
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)
Preprocessing the image input and computing per-step returns
def preprocess(image):
    """ Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) 1-D float vector """
    image = image[35:195]  # crop
    image = image[::2, ::2, 0]  # downsample by a factor of 2 and keep one channel
    image[image == 144] = 0  # erase background (background type 1)
    image[image == 109] = 0  # erase background (background type 2)
    image[image != 0] = 1  # binarize: everything except the background becomes 1
    return image.astype(np.float32).ravel()  # np.float is deprecated; float32 matches the feed dtype
# given one episode's per-step reward list, compute the return G_t of every step
def calc_reward_to_go(reward_list, gamma=0.99):
"""calculate discounted reward"""
reward_arr = np.array(reward_list)
for i in range(len(reward_arr) - 2, -1, -1):
# G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
reward_arr[i] += gamma * reward_arr[i + 1]
# normalize episode rewards
reward_arr -= np.mean(reward_arr)
reward_arr /= np.std(reward_arr)
return reward_arr
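As a quick check with made-up numbers (rewards must be floats, since the array is updated in place):

print(calc_reward_to_go([0.0, 0.0, 1.0]))
# discounted returns before normalization: [0.9801, 0.99, 1.0]
# after standardization the output is roughly [-1.22, -0.004, 1.23]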
Environment setup and the main loop
# create the environment
env = gym.make('Pong-v0')
obs_dim = 80 * 80
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

# build the agent with the PARL framework
model = Model(act_dim=act_dim)
alg = PolicyGradient(model, lr=LEARNING_RATE)
agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

# load a previously saved model if one exists
if os.path.exists('./model.ckpt'):
    agent.restore('./model.ckpt')

for i in range(3000):
    obs_list, action_list, reward_list = run_episode(env, agent)
    if i % 10 == 0:
        logger.info("Train Episode {}, Reward Sum {}.".format(
            i, sum(reward_list)))

    batch_obs = np.array(obs_list)
    batch_action = np.array(action_list)
    batch_reward = calc_reward_to_go(reward_list)

    agent.learn(batch_obs, batch_action, batch_reward)
    if (i + 1) % 100 == 0:
        total_reward = evaluate(env, agent, render=False)
        logger.info('Episode {}, Test reward: {}'.format(
            i + 1, total_reward))
        agent.save('./model.ckpt')
Results