TensorFlow Agents Reinforcement Learning Hands-On Guide: Training an Agent to Master the CartPole Balancing Game
Introduction
As a key member of the TensorFlow ecosystem, TensorFlow Agents gives developers a toolbox for building reinforcement learning models efficiently. In this guide we take the classic CartPole balancing game as an example and walk through the complete workflow of developing an agent with this framework, step by step. Even if you are new to reinforcement learning, you can follow along and get up to speed quickly.
1. Environment Setup and Tooling
1.1 Install the core dependencies
!pip install tf-agents[reverb]        # core framework
!pip install gym[classic_control]     # game environment library
!pip install tensorboard              # training visualization
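To confirm the installation worked, a quick sanity check can print the library versions (this assumes tf_agents exposes a __version__ attribute, which recent releases do):

import tensorflow as tf
import tf_agents
# Print the installed versions to confirm the packages import cleanly.
print(tf.__version__)
print(tf_agents.__version__)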
1.2 Import the required modules
import abc
import numpy as np
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import policy_saver
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.specs import array_spec
from tf_agents.utils import common
2. Creating the Game Environment
2.1 Define the environment class
class CartPoleEnv(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(4,), dtype=np.float32, minimum=-10.0, maximum=10.0, name='observation')
        self._state = np.zeros(4, dtype=np.float32)
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        # Cast to float32 so the observation matches the spec's dtype.
        self._state = np.random.uniform(-0.05, 0.05, size=4).astype(np.float32)
        self._episode_ended = False
        return ts.restart(self._state)
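Note that a PyEnvironment subclass also needs a _step method, which the class above omits. The sketch below (meant to sit inside CartPoleEnv) uses deliberately simplified toy dynamics as a placeholder rather than the real cart-pole physics; for the actual game it is usually easier to load the ready-made Gym environment via suite_gym.load('CartPole-v1').

    def _step(self, action):
        if self._episode_ended:
            # The previous step ended the episode; start a new one.
            return self.reset()
        # Toy dynamics: nudge the state left or right (placeholder, not real physics).
        direction = 1.0 if action == 1 else -1.0
        self._state = (self._state + 0.01 * direction).astype(np.float32)
        # Terminate once the simulated pole angle drifts too far from upright.
        if abs(self._state[2]) > 0.2:
            self._episode_ended = True
            return ts.termination(self._state, reward=0.0)
        return ts.transition(self._state, reward=1.0, discount=1.0)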
3. Building the Agent's Brain
3.1 Configure the neural network
def create_q_network(input_tensor_spec, output_tensor_spec):
    # Number of discrete actions, derived from the bounded action spec (0 and 1 -> 2 actions).
    num_actions = output_tensor_spec.maximum - output_tensor_spec.minimum + 1
    return sequential.Sequential([
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(num_actions)
    ], input_spec=input_tensor_spec)
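As an alternative to assembling the layers by hand, tf_agents also ships a ready-made QNetwork helper. The sketch below could replace the body of create_q_network above; fc_layer_params simply mirrors the two hidden-layer sizes used in the manual version, not tuned values.

from tf_agents.networks import q_network

def create_q_network(input_tensor_spec, output_tensor_spec):
    # QNetwork infers the number of actions from the action spec automatically.
    return q_network.QNetwork(
        input_tensor_spec,
        output_tensor_spec,
        fc_layer_params=(64, 32))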
3.2 Instantiate the agent
env = CartPoleEnv()
tf_env = tf_py_environment.TFPyEnvironment(env)
q_net = create_q_network(tf_env.observation_spec(), tf_env.action_spec())
agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    epsilon_greedy=0.1
)
agent.initialize()  # must be called before collecting data or training
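Optionally, wrapping agent.train in a TensorFlow graph function usually speeds up the training loop noticeably; this uses the common module already imported above.

# Compile the train step into a TF graph for faster iterations.
agent.train = common.function(agent.train)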
4. The Training Pipeline, Step by Step
4.1 Configure the training parameters
num_iterations = 2000
collect_steps_per_iteration = 1
batch_size = 64
replay_buffer_capacity = 1000
4.2 Initialize the replay buffer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=agent.collect_data_spec,
batch_size=tf_env.batch_size,
max_length=replay_buffer_capacity
)
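Before the main loop starts sampling batches, the buffer needs some initial experience, otherwise the first sampling call would fail. A common pattern is a short warm-up with a random policy (this is what the random_tf_policy import is for); a minimal sketch, assuming 100 warm-up steps is enough to seed the buffer:

random_policy = random_tf_policy.RandomTFPolicy(
    tf_env.time_step_spec(), tf_env.action_spec())

# Warm-up: fill the buffer with random transitions before training begins.
for _ in range(100):
    time_step = tf_env.current_time_step()
    action_step = random_policy.action(time_step)
    next_time_step = tf_env.step(action_step.action)
    replay_buffer.add_batch(
        trajectory.from_transition(time_step, action_step, next_time_step))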
4.3 The main training loop
for iteration in range(num_iterations):
    # Collect one step of experience with the exploration (collect) policy.
    time_step = tf_env.current_time_step()
    action_step = agent.collect_policy.action(time_step)
    next_time_step = tf_env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)

    # Sample a batch of adjacent step pairs (num_steps=2) and train on it.
    experience, _ = replay_buffer.get_next(
        sample_batch_size=batch_size, num_steps=2)
    train_loss = agent.train(experience).loss

    # Print training progress every 100 iterations.
    if iteration % 100 == 0:
        print(f'Iteration {iteration}, current loss: {train_loss.numpy()}')
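get_next is convenient for a small demo, but the more idiomatic (and better-performing) pattern in TF-Agents is to read the buffer through a tf.data pipeline and reuse one iterator inside the loop; a sketch:

# Build a tf.data pipeline over the replay buffer.
dataset = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3).prefetch(3)
iterator = iter(dataset)

# Inside the training loop, replace the get_next call with:
experience, _ = next(iterator)
train_loss = agent.train(experience).loss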
5. Evaluation and Deployment
5.1 Testing the agent's performance
def evaluate_agent(policy, num_episodes=10):
    total_rewards = []
    for _ in range(num_episodes):
        time_step = tf_env.reset()
        episode_reward = 0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = tf_env.step(action_step.action)
            episode_reward += time_step.reward.numpy()[0]
        total_rewards.append(episode_reward)
    return np.mean(total_rewards)

# Run the evaluation with the greedy (non-exploring) policy
avg_reward = evaluate_agent(agent.policy)
print(f'Average return: {avg_reward}')
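Since tensorboard was installed in Section 1.1, the evaluation result (and training losses) can be written out with tf.summary and inspected in TensorBoard; a minimal sketch, assuming a ./logs directory:

# Write scalar summaries that TensorBoard can display.
summary_writer = tf.summary.create_file_writer('./logs')
with summary_writer.as_default():
    tf.summary.scalar('average_return', avg_reward, step=num_iterations)
# Then launch: tensorboard --logdir ./logs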
5.2 Saving and loading the model
# Save the trained policy (TF-Agents exports policies via PolicySaver, not the agent object itself)
save_path = './saved_model/'
saver = policy_saver.PolicySaver(agent.policy)
saver.save(save_path)
# Load the policy back as a SavedModel
loaded_policy = tf.saved_model.load(save_path)
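The restored policy can then be used directly for inference, for example:

# Select one action with the restored policy.
time_step = tf_env.reset()
action_step = loaded_policy.action(time_step)
print(action_step.action)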
6. Complete Example Code
# Environment preparation
!pip install tf-agents[reverb] gym[classic_control] tensorboard
import numpy as np
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory

# Create the environment
class CartPoleEnv(py_environment.PyEnvironment):
    ...  # environment class implementation omitted, see Section 2

tf_env = tf_py_environment.TFPyEnvironment(CartPoleEnv())

# Build the agent
q_net = sequential.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(2)   # two discrete actions: push left / push right
])
agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(0.001)
)
agent.initialize()

# Training configuration
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=1,
    max_length=1000
)

# Run training
for _ in range(1000):
    # Collect one step of experience
    time_step = tf_env.current_time_step()
    action_step = agent.collect_policy.action(time_step)
    next_time_step = tf_env.step(action_step.action)
    # Store the experience as a trajectory
    replay_buffer.add_batch(
        trajectory.from_transition(time_step, action_step, next_time_step)
    )
    # Sample and train once the buffer holds enough experience
    if replay_buffer.num_frames() > 64:
        experience, _ = replay_buffer.get_next(sample_batch_size=64, num_steps=2)
        agent.train(experience)
# Evaluation (same code as the evaluation section above)
7. Troubleshooting Common Problems
- The score is not improving early in training?
  - Increase the exploration rate (the epsilon_greedy parameter)
  - Check whether the network architecture is reasonable
  - Increase the number of training iterations
- Running out of GPU memory?
  - Reduce the batch_size parameter
  - Use fewer or smaller network layers
  - Switch to a simpler environment
- How to speed up training?
  - Enable GPU acceleration (a quick check is shown after this list)
  - Increase the replay buffer capacity
  - Try a learning rate in the 0.0005-0.01 range
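To confirm that TensorFlow actually sees a GPU before relying on it for acceleration, a quick check:

import tensorflow as tf
# Lists the GPUs TensorFlow can use; an empty list means training will run on CPU.
print(tf.config.list_physical_devices('GPU'))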