1. Define the number of agents
agent_num = 10  # alternatives: 15, 20, 25, 30
2. Create the environment and initialize the central value network
import torch
import torch.nn as nn
import torch.optim as optim

env = ToyEnv()

class CentralValueNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(CentralValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Initialize the network
input_dim = 100   # dimension of the global state
hidden_dim = 64   # hidden-layer dimension
output_dim = 1    # output dimension, i.e. the value estimate
cvn = CentralValueNetwork(input_dim, hidden_dim, output_dim)

# Initialize weights and biases
for m in cvn.modules():
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)

# Set up the optimizer
optimizer = optim.Adam(cvn.parameters(), lr=0.001)
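The central value network scores a single joint (global) state rather than any one agent's local observation. As a minimal, hypothetical sketch (the build_global_state helper and the dummy observations below are illustrative assumptions, not part of the original code), the per-agent local states could be flattened, concatenated, and zero-padded or truncated to input_dim before being fed to cvn:

def build_global_state(local_states, input_dim):
    # Hypothetical helper: flatten and concatenate all agents' local states,
    # then zero-pad or truncate to the fixed input size expected by cvn.
    flat = torch.cat([torch.as_tensor(s, dtype=torch.float32).flatten() for s in local_states])
    if flat.numel() < input_dim:
        flat = torch.cat([flat, torch.zeros(input_dim - flat.numel())])
    return flat[:input_dim]

# Purely illustrative: dummy local observations standing in for the states gathered in step 4.
dummy_states = [torch.randn(8) for _ in range(3)]
with torch.no_grad():
    v_global = cvn(build_global_state(dummy_states, input_dim))  # value estimate of the joint state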
3. Create the agents
agents = []
for n in range(agent_num):
    agents.append(PPO(state_dim, action_dim,
                      lr_actor=config.lr, lr_critic=config.lr * 5,
                      gamma=config.gamma, K_epochs=config.epochs,
                      eps_clip=config.eps_clip, has_continuous_action_space=True,
                      action_std_init=config.action_std_init))
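Here state_dim, action_dim, and config are assumed to be defined earlier in the full code. As a purely hypothetical illustration of what they might contain (the concrete values below are placeholders, not from the original):

from types import SimpleNamespace

state_dim = 10         # dimension of each agent's local observation (placeholder)
action_dim = 2         # dimension of each agent's continuous action (placeholder)
config = SimpleNamespace(
    lr=3e-4,             # actor learning rate (the critic uses lr * 5 above)
    gamma=0.99,          # discount factor
    epochs=10,           # PPO update epochs per batch (K_epochs)
    eps_clip=0.2,        # PPO clipping parameter
    action_std_init=0.6  # initial action standard deviation
)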
4. Reset the environment to obtain the initial states
states = [env.reset()[i] for i in range(agent_num)]
5. Each agent corresponds to one state; for each (agent, state) pair:
states_, actions, reward_originals = [], [], []  # buffers filled in the loop below
for agent, state in zip(agents, states):
5.1 Select an action
    action = agent.select_action(env.NormalizedPreprocess(state))
5.2 Execute the action
    # a_normal and p_limit_t are assumed to be defined elsewhere: a_normal is presumably the
    # selected action mapped back to the environment's native range, p_limit_t the power limit at this step
    state_, reward_original, p_grid, c1_, c2_, c3_ = env.step(a_normal, p_limit_t)
5.3 Store the transition data
    states_.append(state_)
    actions.append(action)
    reward_originals.append(reward_original)
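After the loop over all agents completes, the collected next states would typically become the current states for the following timestep, since in a full rollout this entire step-5 block repeats once per timestep:

states = states_  # advance to the next timestep before repeating step 5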
6. Update each agent using the central value network
for agent in agents:
    agent.update(cvn)
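The optimizer created in step 2 would be used to train the central value network itself. A minimal sketch, reusing the hypothetical build_global_state helper from step 2 and a placeholder value target (here simply the summed one-step team reward; in practice a discounted or bootstrapped return would be used):

# Regress the central value estimate of the joint next state toward a value target.
target_return = torch.tensor([sum(reward_originals)], dtype=torch.float32)  # placeholder target
value_pred = cvn(build_global_state(states_, input_dim))
value_loss = nn.functional.mse_loss(value_pred, target_return)
optimizer.zero_grad()
value_loss.backward()
optimizer.step()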