多智能体游戏环境PettingZoo_CleanRL实施PPO_代码详解

传说中的龙卷风

已于 2023-03-24 14:40:38 修改

阅读量1k

点赞数

文章标签：游戏深度学习 python

于 2023-03-23 23:01:55 首次发布

原文链接：https://pettingzoo.farama.org/tutorials/cleanrl/implementing_PPO/

版权

文章介绍了如何在CleanRL框架下使用PyTorch训练PPO算法，并添加了保存模型的代码，以便于后续的测试和可视化。训练代码（cleanRL.py）处理Pistonball环境，而测试代码（t_cleanRL.py）用于加载和评估训练好的模型。整个过程涉及环境模拟、智能体决策和模型保存与加载机制。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

原始代码链接
我在原始代码里加了保存模型的代码，以便训练结束后可以随时拿出来测试、可视化训练的成果（智能体的表现）。
训练代码写在cleanRL.py，测试代码（我额外写的）写在t_cleanRL.py，这两个文件要位于同一个文件夹中。

cleanRL.py用于训练：

''''''
"""Basic code which shows what it's like to run PPO on the Pistonball env using the parallel API, this code is inspired by CleanRL.

This code is exceedingly basic, with no logging or weights saving.
The intention was for users to have a (relatively clean) ~200 line file to refer to when they want to design their own learning algorithm.

Author: Jet (https://github.com/jjshoots)
基本代码展示了使用并行API在Pistonball env上运行PPO是怎样的。该代码的灵感来自CleanRL。
这段代码非常基本，没有任何日志记录或权重保存。
其目的是让用户在设计自己的学习算法时有一个（相对干净的）~200行的文件可以参考。
作者：Jet(https://github.com/jjshoots)
"""

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from supersuit import color_reduction_v0, frame_stack_v1, resize_v1
from torch.distributions.categorical import Categorical

from pettingzoo.butterfly import pistonball_v6

import pickle  # 我想保存模型

class Agent(nn.Module):
    def __init__(self, num_actions):
        super().__init__()

        self.network = nn.Sequential(
            self._layer_init(nn.Conv2d(4, 32, 3, padding=1)),
            nn.MaxPool2d(2),
            nn.ReLU(),
            self._layer_init(nn.Conv2d(32, 64, 3, padding=1)),
            nn.MaxPool2d(2),
            nn.ReLU(),
            self._layer_init(nn.Conv2d(64, 128, 3, padding=1)),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Flatten(),
            self._layer_init(nn.Linear(128 * 8 * 8, 512)),
            nn.ReLU(),
        )
        self.actor = self._layer_init(nn.Linear(512, num_actions), std=0.01)
        self.critic = self._layer_init(nn.Linear(512, 1))

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

    def get_value(self, x):
        return self.critic(self.network(x / 255.0))

    def get_action_and_value(self, x, action=None):
        hidden = self.network(x / 255.0)
        logits = self.actor(hidden)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(hidden)


def batchify_obs(obs, device):
    """Converts PZ style observations to batch of torch arrays."""
    '''将PZ型观测值转换为一批torch数组。（PZ指的是PettingZoo）'''
    # convert to list of np arrays
    obs = np.stack([obs[a] for a in obs], axis=0)
    # transpose to be (batch, channel, height, width)
    obs = obs.transpose(0, -1, 1, 2)
    # convert to torch
    obs = torch.tensor(obs).to(device)

    return obs


def batchify(x, device):
    """Converts PZ style returns to batch of torch arrays."""
    '''将PZ型转换为一批torch数组。（PZ指的是PettingZoo）'''
    # convert to list of np arrays
    x = np.stack([x[a] for a in x], axis=0)
    # convert to torch
    x = torch.tensor(x).to(device)

    return x


def unbatchify(x, env):
    """Converts np array to PZ style arguments."""
    '''将np数组转换为PZ型参数'''
    x = x.cpu().numpy()
    x = {a: x[i] for i, a in enumerate(env.possible_agents)}

    return x


if __name__ == "__main__":
    """ALGO PARAMS"""
    '''算法参数（ALGO是algorithm的简写）'''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ent_coef = 0.1
    vf_coef = 0.1
    clip_coef = 0.1
    gamma = 0.99
    batch_size = 32
    stack_size = 4
    frame_size = (64, 64)
    max_cycles = 125
    total_episodes = 2  # 训练的局数，在这里修改。

    """ ENV SETUP """
    '''创建环境'''
    env = pistonball_v6.parallel_env(
        render_mode="rgb_array", continuous=False, max_cycles=max_cycles
    )
    env = color_reduction_v0(env)
    env = resize_v1(env, frame_size[0], frame_size[1])
    env = frame_stack_v1(env, stack_size=stack_size)
    num_agents = len(env.possible_agents)
    num_actions = env.action_space(env.possible_agents[0]).n
    observation_size = env.observation_space(env.possible_agents[0]).shape

    """ LEARNER SETUP """
    '''创建学习器'''
    agent = Agent(num_actions=num_actions).to(device)

    optimizer = optim.Adam(agent.parameters(), lr=0.001, eps=1e-5)

    """ ALGO LOGIC: EPISODE STORAGE"""
    '''算法逻辑：回合存储'''
    end_step = 0
    total_episodic_return = 0
    rb_obs = torch.zeros((max_cycles, num_agents, stack_size, *frame_size)).to(device)
    rb_actions = torch.zeros((max_cycles, num_agents)).to(device)
    rb_logprobs = torch.zeros((max_cycles, num_agents)).to(device)
    rb_rewards = torch.zeros((max_cycles, num_agents)).to(device)
    rb_terms = torch.zeros((max_cycles, num_agents)).to(device)
    rb_values = torch.zeros((max_cycles, num_agents)).to(device)

    """ TRAINING LOGIC """
    '''训练逻辑'''
    # train for n number of episodes
    # 训练n个回合
    for episode in range(total_episodes):
        # collect an episode
        # 收集一个回合
        with torch.no_grad():
            # collect observations and convert to batch of torch tensors
            # 收集观测值，并转化为一批torch张量
            next_obs = env.reset(seed=None)  # 在这里对环境重置
            # reset the episodic return
            # 重置回合返回
            total_episodic_return = 0

            # each episode has num_steps
            # 每个回合有num_steps（根据下方代码，应该是max_cycles）个时间步
            for step in range(0, max_cycles):
                # rollover the observation
                # 转换观测值
                obs = batchify_obs(next_obs, device)  # 将PZ型观测值转换为一批torch数组。

                # get action from the agent
                # 从智能体中获取动作
                actions, logprobs, _, values = agent.get_action_and_value(obs)

                # execute the environment and log data
                # 执行环境和log数据
                next_obs, rewards, terms, truncs, infos = env.step(
                    unbatchify(actions, env)
                )
                # unbatchify()将np数组转换为PZ型参数

                # add to episode storage
                # 添加到回合存储
                rb_obs[step] = obs
                rb_rewards[step] = batchify(rewards, device)
                rb_terms[step] = batchify(terms, device)
                rb_actions[step] = actions
                rb_logprobs[step] = logprobs
                rb_values[step] = values.flatten()

                # compute episodic return
                # 计算回合回报
                total_episodic_return += rb_rewards[step].cpu().numpy()

                # if we reach termination or truncation, end
                # 如果我们达到了终止或截断状态，那就结束。
                if any([terms[a] for a in terms]) or any([truncs[a] for a in truncs]):
                    end_step = step
                    break

        # bootstrap value if not done
        # 如果没有完成，则自举值。
        # 根据其他估算值来更新估算值的思想，我们称其为自举（bootstrap）。【摘抄自《EasyRL》】
        with torch.no_grad():
            rb_advantages = torch.zeros_like(rb_rewards).to(device)
            for t in reversed(range(end_step)):
                delta = (
                    rb_rewards[t]
                    + gamma * rb_values[t + 1] * rb_terms[t + 1]
                    - rb_values[t]
                )
                rb_advantages[t] = delta + gamma * gamma * rb_advantages[t + 1]
            rb_returns = rb_advantages + rb_values

        # convert our episodes to batch of individual transitions
        # 将我们的回合转换为一批个体过渡
        # ？？我没太懂。
        b_obs = torch.flatten(rb_obs[:end_step], start_dim=0, end_dim=1)
        b_logprobs = torch.flatten(rb_logprobs[:end_step], start_dim=0, end_dim=1)
        b_actions = torch.flatten(rb_actions[:end_step], start_dim=0, end_dim=1)
        b_returns = torch.flatten(rb_returns[:end_step], start_dim=0, end_dim=1)
        b_values = torch.flatten(rb_values[:end_step], start_dim=0, end_dim=1)
        b_advantages = torch.flatten(rb_advantages[:end_step], start_dim=0, end_dim=1)

        # Optimizing the policy and value network
        # 优化策略和价值网络
        b_index = np.arange(len(b_obs))
        clip_fracs = []
        for repeat in range(3):
            # shuffle the indices we use to access the data
            # 打乱我们用来访问数据的索引
            np.random.shuffle(b_index)
            for start in range(0, len(b_obs), batch_size):
                # select the indices we want to train on
                # 选择我们想要训练的索引
                end = start + batch_size
                batch_index = b_index[start:end]

                _, newlogprob, entropy, value = agent.get_action_and_value(
                    b_obs[batch_index], b_actions.long()[batch_index]
                )
                logratio = newlogprob - b_logprobs[batch_index]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    # 计算近似KL散度
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clip_fracs += [
                        ((ratio - 1.0).abs() > clip_coef).float().mean().item()
                    ]

                # normalize advantaegs
                # 规范化优势
                advantages = b_advantages[batch_index]
                advantages = (advantages - advantages.mean()) / (
                    advantages.std() + 1e-8
                )

                # Policy loss
                # 策略损失
                pg_loss1 = -b_advantages[batch_index] * ratio
                pg_loss2 = -b_advantages[batch_index] * torch.clamp(
                    ratio, 1 - clip_coef, 1 + clip_coef
                )
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss
                # 价值损失
                value = value.flatten()
                v_loss_unclipped = (value - b_returns[batch_index]) ** 2
                v_clipped = b_values[batch_index] + torch.clamp(
                    value - b_values[batch_index],
                    -clip_coef,
                    clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        print(f"Training episode {episode}")
        print(f"Episodic Return: {np.mean(total_episodic_return)}")
        print(f"Episode Length: {end_step}") # 该回合的长度，即该回合的时间步的数量
        print("")
        print(f"Value Loss: {v_loss.item()}")
        print(f"Policy Loss: {pg_loss.item()}")
        print(f"Old Approx KL: {old_approx_kl.item()}")
        print(f"Approx KL: {approx_kl.item()}")
        print(f"Clip Fraction: {np.mean(clip_fracs)}")
        print(f"Explained Variance: {explained_var.item()}")
        print("\n-------------------------------------------\n")



    # 我自己加上的：保存agent。训练花了那么多时间，当然要保存成果啦。
    with open('./model/cleanRL_agent2.pickle', 'wb') as f: # 需要自己事先手动创建model文件夹，不然会报错。
        pickle.dump(agent, f)
        f.close()


    """ RENDER THE POLICY """
    '''渲染策略'''
    env = pistonball_v6.parallel_env(render_mode="human", continuous=False)
    env = color_reduction_v0(env)
    env = resize_v1(env, 64, 64)
    env = frame_stack_v1(env, stack_size=4)


    agent.eval()  # 将模块设置为评估模式。

    with torch.no_grad():
        # render 5 episodes out
        for episode in range(1):
            obs = batchify_obs(env.reset(seed=None), device)
            terms = [False]
            truncs = [False]
            while not any(terms) and not any(truncs):
                actions, logprobs, _, values = agent.get_action_and_value(obs)
                obs, rewards, terms, truncs, infos = env.step(unbatchify(actions, env))
                obs = batchify_obs(obs, device)
                terms = [terms[a] for a in terms]
                truncs = [truncs[a] for a in truncs]

t_cleanRL.py用于测试：

#测试cleanRL的训练成果
import torch
import pickle
from pettingzoo.butterfly import pistonball_v6
from supersuit import color_reduction_v0, frame_stack_v1, resize_v1
from cleanRL import batchify_obs,unbatchify,Agent

# 加载agent
f = open('./model/cleanRL_agent2.pickle', 'rb')
agent = pickle.load(f)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent.eval()  # 将模块设置为评估模式。

with torch.no_grad():
    # render 5 episodes out
    # 渲染5局
    for episode in range(5):
        env = pistonball_v6.parallel_env(render_mode="human", continuous=False)
        env = color_reduction_v0(env)
        env = resize_v1(env, 64, 64)
        env = frame_stack_v1(env, stack_size=4)
        num_agents = len(env.possible_agents)
        total_return = 0  # 记录所有智能体总的奖励
        obs = batchify_obs(env.reset(seed=None), device)
        terms = [False]
        truncs = [False]
        while not any(terms) and not any(truncs):
            actions, logprobs, _, values = agent.get_action_and_value(obs)
            obs, rewards, terms, truncs, infos = env.step(unbatchify(actions, env))
            obs = batchify_obs(obs, device)
            terms = [terms[a] for a in terms]
            truncs = [truncs[a] for a in truncs]
            for key in rewards: # 把所有智能体的奖励全加上
                total_return+=rewards[key]
        # 每局结束后，打印：
        total_return/=num_agents
        print("第"+str(episode)+"局，所有智能体的平均回报为："+str(total_return))
        env.close()