TensorFlow Agents Reinforcement Learning Hands-On Guide: Training an Agent to Master the CartPole Balancing Game
Introduction
As a key member of the TensorFlow ecosystem, TensorFlow Agents gives developers a toolbox for building reinforcement learning models efficiently. In this guide we take the classic CartPole balancing game as an example and walk through the complete workflow of developing an agent with this framework, step by step. Even if you are new to reinforcement learning, you can follow along and get up to speed quickly.
1. Environment Setup and Tooling
1.1 Install the core dependencies
!pip install tf-agents[reverb]        # core framework
!pip install gym[classic_control]     # game environment library
!pip install tensorboard              # training visualization
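To confirm the installation worked, a quick sanity check can print the library versions (this assumes tf_agents exposes a __version__ attribute, which recent releases do):

import tensorflow as tf
import tf_agents
# Print the installed versions to confirm the packages import cleanly.
print(tf.__version__)
print(tf_agents.__version__)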
1.2 Import the required modules
import abc
import numpy as np
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import policy_saver
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.specs import array_spec
from tf_agents.utils import common
2. Creating the Game Environment
2.1 Define the environment class
class CartPoleEnv(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(4,), dtype=np.float32, minimum=-10.0, maximum=10.0, name='observation')
        self._state = np.zeros(4, dtype=np.float32)
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        # Cast to float32 so the observation matches the spec's dtype.
        self._state = np.random.uniform(-0.05, 0.05, size=4).astype(np.float32)
        self._episode_ended = False
        return ts.restart(self._state)
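Note that a PyEnvironment subclass also needs a _step method, which the class above omits. The sketch below (meant to sit inside CartPoleEnv) uses deliberately simplified toy dynamics as a placeholder rather than the real cart-pole physics; for the actual game it is usually easier to load the ready-made Gym environment via suite_gym.load('CartPole-v1').

    def _step(self, action):
        if self._episode_ended:
            # The previous step ended the episode; start a new one.
            return self.reset()
        # Toy dynamics: nudge the state left or right (placeholder, not real physics).
        direction = 1.0 if action == 1 else -1.0
        self._state = (self._state + 0.01 * direction).astype(np.float32)
        # Terminate once the simulated pole angle drifts too far from upright.
        if abs(self._state[2]) > 0.2:
            self._episode_ended = True
            return ts.termination(self._state, reward=0.0)
        return ts.transition(self._state, reward=1.0, discount=1.0)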
3. Building the Agent's Brain
3.1 Configure the neural network
def create_q_network(input_tensor_spec, output_tensor_spec):
    # Number of discrete actions, derived from the bounded action spec (0 and 1 -> 2 actions).
    num_actions = output_tensor_spec.maximum - output_tensor_spec.minimum + 1
    return sequential.Sequential([
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(num_actions)
    ], input_spec=input_tensor_spec)
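As an alternative to assembling the layers by hand, tf_agents also ships a ready-made QNetwork helper. The sketch below could replace the body of create_q_network above; fc_layer_params simply mirrors the two hidden-layer sizes used in the manual version, not tuned values.

from tf_agents.networks import q_network

def create_q_network(input_tensor_spec, output_tensor_spec):
    # QNetwork infers the number of actions from the action spec automatically.
    return q_network.QNetwork(
        input_tensor_spec,
        output_tensor_spec,
        fc_layer_params=(64, 32))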
3.2 Instantiate the agent
env = CartPoleEnv()
tf_env = tf_py_environment.TFPyEnvironment(env)
q_net = create_q_network(tf_env.observation_spec(), tf_env.action_spec())
agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    epsilon_greedy=0.1
)
agent.initialize()  # must be called before collecting data or training
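Optionally, wrapping agent.train in a TensorFlow graph function usually speeds up the training loop noticeably; this uses the common module already imported above.

# Compile the train step into a TF graph for faster iterations.
agent.train = common.function(agent.train)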
4. The Training Pipeline, Step by Step
4.1 Configure the training parameters
num_iterations = 2000
collect_steps_per_iteration = 1
batch_size = 64
replay_buffer_capacity = 1000
4.2 Initialize the replay buffer
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=agent.collect_data_spec,
batch_size=tf_env.batch_size,
max_length=replay_buffer_capacity
)
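Before the main loop starts sampling batches, the buffer needs some initial experience, otherwise the first sampling call would fail. A common pattern is a short warm-up with a random policy (this is what the random_tf_policy import is for); a minimal sketch, assuming 100 warm-up steps is enough to seed the buffer:

random_policy = random_tf_policy.RandomTFPolicy(
    tf_env.time_step_spec(), tf_env.action_spec())

# Warm-up: fill the buffer with random transitions before training begins.
for _ in range(100):
    time_step = tf_env.current_time_step()
    action_step = random_policy.action(time_step)
    next_time_step = tf_env.step(action_step.action)
    replay_buffer.add_batch(
        trajectory.from_transition(time_step, action_step, next_time_step))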
4.3 The main training loop
for iteration in range(num_iterations):
    # Collect one step of experience with the exploration (collect) policy.
    time_step = tf_env.current_time_step()
    action_step = agent.collect_policy.action(time_step)
    next_time_step = tf_env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)

    # Sample a batch of adjacent step pairs (num_steps=2) and train on it.
    experience, _ = replay_buffer.get_next(
        sample_batch_size=batch_size, num_steps=2)
    train_loss = agent.train(experience).loss

    # Print training progress every 100 iterations.
    if iteration % 100 == 0:
        print(f'Iteration {iteration}, current loss: {train_loss.numpy()}')
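get_next is convenient for a small demo, but the more idiomatic (and better-performing) pattern in TF-Agents is to read the buffer through a tf.data pipeline and reuse one iterator inside the loop; a sketch:

# Build a tf.data pipeline over the replay buffer.
dataset = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3).prefetch(3)
iterator = iter(dataset)

# Inside the training loop, replace the get_next call with:
experience, _ = next(iterator)
train_loss = agent.train(experience).loss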
5. Evaluation and Deployment
5.1 Testing the agent's performance
def evaluate_agent(policy, num_episodes=10):
    total_rewards = []
    for _ in range(num_episodes):
        time_step = tf_env.reset()
        episode_reward = 0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = tf_env.step(action_step.action)
            episode_reward += time_step.reward.numpy()[0]
        total_rewards.append(episode_reward)
    return np.mean(total_rewards)

# Run the evaluation with the greedy (non-exploring) policy
avg_reward = evaluate_agent(agent.policy)
print(f'Average return: {avg_reward}')
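Since tensorboard was installed in Section 1.1, the evaluation result (and training losses) can be written out with tf.summary and inspected in TensorBoard; a minimal sketch, assuming a ./logs directory:

# Write scalar summaries that TensorBoard can display.
summary_writer = tf.summary.create_file_writer('./logs')
with summary_writer.as_default():
    tf.summary.scalar('average_return', avg_reward, step=num_iterations)
# Then launch: tensorboard --logdir ./logs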
5.2 Saving and loading the model
# Save the trained policy (TF-Agents exports policies via PolicySaver, not the agent object itself)
save_path = './saved_model/'
saver = policy_saver.PolicySaver(agent.policy)
saver.save(save_path)
# Load the policy back as a SavedModel
loaded_policy = tf.saved_model.load(save_path)
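The restored policy can then be used directly for inference, for example:

# Select one action with the restored policy.
time_step = tf_env.reset()
action_step = loaded_policy.action(time_step)
print(action_step.action)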
6. Complete Example Code
# Environment preparation
!pip install tf-agents[reverb] gym[classic_control] tensorboard
import numpy as np
import tensorflow as tf
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory

# Create the environment
class CartPoleEnv(py_environment.PyEnvironment):
    ...  # environment class implementation omitted, see Section 2

tf_env = tf_py_environment.TFPyEnvironment(CartPoleEnv())

# Build the agent
q_net = sequential.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(2)   # two discrete actions: push left / push right
])
agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(0.001)
)
agent.initialize()

# Training configuration
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=1,
    max_length=1000
)

# Run training
for _ in range(1000):
    # Collect one step of experience
    time_step = tf_env.current_time_step()
    action_step = agent.collect_policy.action(time_step)
    next_time_step = tf_env.step(action_step.action)
    # Store the experience as a trajectory
    replay_buffer.add_batch(
        trajectory.from_transition(time_step, action_step, next_time_step)
    )
    # Sample and train once the buffer holds enough experience
    if replay_buffer.num_frames() > 64:
        experience, _ = replay_buffer.get_next(sample_batch_size=64, num_steps=2)
        agent.train(experience)
# Evaluation (same code as the evaluation section above)
7. Troubleshooting Common Problems
- The score is not improving early in training?
  - Increase the exploration rate (the epsilon_greedy parameter)
  - Check whether the network architecture is reasonable
  - Increase the number of training iterations
- Running out of GPU memory?
  - Reduce the batch_size parameter
  - Use fewer or smaller network layers
  - Switch to a simpler environment
- How to speed up training?
  - Enable GPU acceleration (a quick check is shown after this list)
  - Increase the replay buffer capacity
  - Try a learning rate in the 0.0005-0.01 range
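To confirm that TensorFlow actually sees a GPU before relying on it for acceleration, a quick check:

import tensorflow as tf
# Lists the GPUs TensorFlow can use; an empty list means training will run on CPU.
print(tf.config.list_physical_devices('GPU'))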