目录
1、PyG中数据集的属性查询(Question 1,类别数目和特征数目 Question 2,3)
2、OGB包的使用(Question 4:How many features are in the ogbn-arxiv graph?)
① torch.nn.ModuleList()类、torch.nn.LogSoftmax()类的使用
② PyG中GCNConv()类、BatchNorm()类的使用
④ torch.flatten(input, start_dim=0, end_dim=-1):把start_dim~end_dim之间的维度合并(展平)成一个维度并返回;使用默认参数时会展平所有维度,得到一个“一维Tensor”。
⑤ train函数的理解,def train(model, data, train_idx, optimizer, loss_fn):
⑦ test(...)方法中的out.argmax(......)函数使用:如代码注释
1、PyG中数据集的属性查询(Question 1,类别数目和特征数目 Question 2,3)
# Question 1:What is the number of classes and number of features in the ENZYMES dataset?
def get_num_classes(pyg_dataset):
    """Return the number of classes of a PyG dataset.

    Args:
        pyg_dataset: Dataset object exposing a ``num_classes`` attribute.

    Returns:
        The value of ``pyg_dataset.num_classes``.
    """
    return pyg_dataset.num_classes
def get_num_features(pyg_dataset):
    """Return the number of features of a PyG dataset.

    Args:
        pyg_dataset: Dataset object exposing a ``num_features`` attribute.

    Returns:
        The value of ``pyg_dataset.num_features``.
    """
    return pyg_dataset.num_features
# Only runs outside the autograder. NOTE(review): `os`, `pyg_dataset` and
# `name` are not defined above this point in the file; they are presumably
# set up in an earlier notebook cell -- confirm before running as a script.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    num_classes = get_num_classes(pyg_dataset)
    num_features = get_num_features(pyg_dataset)
    print("{} dataset has {} classes".format(name, num_classes))
    print("{} dataset has {} features".format(name, num_features))
# Question 2:What is the label of the graph with index 100 in the ENZYMES dataset?
def get_graph_class(pyg_dataset, idx):
    """Return the class label of one graph in a graph-classification dataset.

    Args:
        pyg_dataset: Indexable dataset whose items expose a single-element
            label tensor ``y``.
        idx: Index of the graph within the dataset.

    Returns:
        The label of graph ``idx`` as a Python ``int``.
    """
    # `.item()` unwraps the single-element label tensor into a Python scalar.
    return int(pyg_dataset[idx].y.item())
# Here pyg_dataset is a dataset for graph classification
if 'IS_GRADESCOPE_ENV' not in os.environ:
    graph_0 = pyg_dataset[0]
    print(graph_0)
    idx = 100
    label = get_graph_class(pyg_dataset, idx)
    print('Graph with index {} has label {}'.format(idx, label))

# Question 3:How many edges does the graph with index 200 have?
# Raw edge count as stored in the data object (not halved).
print(pyg_dataset[200].num_edges)
def get_graph_num_edges(pyg_dataset, idx):
    """Return the number of undirected edges of one graph in the dataset.

    The graph is assumed to be undirected and to store every edge twice
    (once per direction), so the stored edge count is halved.

    Args:
        pyg_dataset: Indexable dataset whose items expose ``num_edges``.
        idx: Index of the graph within the dataset.

    Returns:
        Half of ``pyg_dataset[idx].num_edges`` as an ``int``.
    """
    # Floor division keeps the result an integer; true division (`/`) would
    # return a float such as 84.0.
    return pyg_dataset[idx].num_edges // 2
# Only runs outside the autograder: report the edge count of graph 200.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    idx = 200
    num_edges = get_graph_num_edges(pyg_dataset, idx)
    print('Graph with index {} has {} edges'.format(idx, num_edges))
2、OGB包的使用(Question 4:How many features are in the ogbn-arxiv graph?)
#Question 4
import torch
import pandas as pd
import os
print("aa1")
import torch_geometric.transforms as T
print("aa2")
from ogb.nodeproppred import PygNodePropPredDataset
print("aa3")
if 'IS_GRADESCOPE_ENV' not in os.environ:
    dataset_name = 'ogbn-arxiv'
    print("bb")
    # Load the dataset and transform it to sparse tensor; files are stored
    # under ./Arxiv.
    dataset = PygNodePropPredDataset(name=dataset_name,
                                     transform=T.ToSparseTensor(),
                                     root="./Arxiv")
    print('The {} dataset has {} graph'.format(dataset_name, len(dataset)))

    # Extract the graph
    data = dataset[0]
    print(data)
#Question 4
def graph_num_features(data):
    """Return the number of features of a PyG data object.

    Args:
        data: Graph data object exposing a ``num_features`` attribute.

    Returns:
        The value of ``data.num_features``.
    """
    return data.num_features
# Only runs outside the autograder: report the feature count of `data`.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    num_features = graph_num_features(data)
    print('The graph has {} features'.format(num_features))
3、节点分类任务GNN框架的实现与初步理解
① torch.nn.ModuleList()类、torch.nn.LogSoftmax()类的使用
② PyG中GCNConv()类、BatchNorm()类的使用
③ 此处GNN类的作用/构成理解:
__init__(...)方法中定义上图中的各个操作(层),forward(...)方法负责依次调用这些操作,将图中的节点特征X、邻接矩阵adj_t最终映射为一个N×C的输出矩阵(N:节点数,C:类别数;return_embeds=True时输出的是节点嵌入)。
④ torch.flatten(input, start_dim=0, end_dim=-1):把start_dim~end_dim之间的维度合并(展平)成一个维度并返回;使用默认参数时会展平所有维度,得到一个“一维Tensor”。
举例:一个3×3×3的三维张量,展平第0~1维,最终返回一个形状为(9, 3)的二维张量。
import numpy as np  # required by this snippet; not imported elsewhere in the file

x = np.arange(27)
x = np.reshape(x, (3, 3, 3))
x = torch.from_numpy(x)
print('before flatten', x)
# Merge dims 0 and 1 into one: shape (3, 3, 3) -> (9, 3).
x = torch.flatten(x, start_dim=0, end_dim=1)
print('after flatten', x)
输出如下图:
结果分析:
首先,展平并不会改变元素的总数,只是把被合并的几个维度的长度相乘,得到一个新的维度。例如一个3*3的二维矩阵,把第0~1维展平后,变成一个含9个元素的一维向量。
回到本例,展平第0~1维,相当于取消x[0]、x[1]和x[2]之间的分组,即去掉最外边的一层中括号,把其中9个长度为3的行向量[0,1,2]、[3,4,5]...依次排在一起。结果如下图所示,是一个形状为(9, 3)的二维张量,而不是一维向量。
⑤ train函数的理解,def train(model, data, train_idx, optimizer, loss_fn):
作用:完成一个epoch的训练(即所有数据走一遍)。
过程:model.train()函数(修改self.training变量的值,使得dropout等操作有选择地执行;dropout的调用需要用到self.training变量,见forward中的training=self.training)——“优化器”梯度清零——data喂到model里,得到输出结果out——利用loss函数和out以及data标签计算损失——反向传播计算梯度——optimizer.step()更新模型参数。
⑥ loss_fn函数:
第二个参数(标签target)需要为一维向量,不能为二维;而data.y[train_idx]是形状为(N, 1)的二维张量,因此要使用torch.flatten(...)函数把它展平成形状为(N,)的一维张量。
⑦ test(...)方法中的out.argmax(......)函数使用:如代码注释
class GCN(torch.nn.Module):
    """Stack of GCNConv layers for node-level prediction.

    Every layer except the last is followed by batch normalisation, ReLU and
    dropout. The last GCNConv output is passed through log-softmax unless
    ``return_embeds`` is set, in which case it is returned unchanged.

    NOTE(review): ``GCNConv`` is not imported anywhere in this file; it is
    presumably ``torch_geometric.nn.GCNConv`` imported in an earlier notebook
    cell -- confirm.

    Args:
        input_dim: Width of the input node features.
        hidden_dim: Width of every hidden layer.
        output_dim: Width of the final layer output.
        num_layers: Number of GCNConv layers.
        dropout: Probability of an element getting zeroed.
        return_embeds: Skip the log-softmax and return the last conv output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds=False):
        super().__init__()

        # num_layers GCNConv layers and num_layers - 1 BatchNorm1d layers.
        self.convs = torch.nn.ModuleList()
        self.bns = torch.nn.ModuleList()
        in_dim = input_dim
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(in_dim, hidden_dim))
            self.bns.append(torch.nn.BatchNorm1d(hidden_dim))
            in_dim = hidden_dim
        # `in_dim` is still `input_dim` when there is only one layer, so the
        # final layer always receives the correct input width.
        self.convs.append(GCNConv(in_dim, output_dim))

        # Normalise over the last (class) dimension explicitly instead of
        # relying on the deprecated implicit-dim behaviour.
        self.softmax = torch.nn.LogSoftmax(dim=-1)

        # Probability of an element getting zeroed
        self.dropout = dropout

        # Skip classification layer and return node embeddings
        self.return_embeds = return_embeds

    def reset_parameters(self):
        """Re-initialise the parameters of every conv and batch-norm layer."""
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adj_t):
        """Apply the network to node features ``x`` with adjacency ``adj_t``.

        Returns:
            The log-softmax of the last conv output, or the raw last conv
            output when ``self.return_embeds`` is true.
        """
        out = x
        # Hidden layers: conv -> batch norm -> ReLU -> dropout.
        for conv, bn in zip(self.convs[:-1], self.bns):
            out = conv(out, adj_t)
            out = bn(out)
            out = torch.nn.functional.relu(out)
            # Dropout is only active while the module is in training mode.
            out = torch.nn.functional.dropout(out, p=self.dropout,
                                              training=self.training)
        out = self.convs[-1](out, adj_t)
        if self.return_embeds:
            return out
        return self.softmax(out)
def train(model, data, train_idx, optimizer, loss_fn):
    """Run one full-graph training step and return the training loss.

    Args:
        model: Module called as ``model(data.x, data.adj_t)``.
        data: Graph data object exposing ``x``, ``adj_t`` and ``y``.
        train_idx: Index used to select the training rows of the model
            output and of ``data.y``.
        optimizer: Optimizer over the model parameters.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar
            tensor.

    Returns:
        The loss on the training rows as a Python float.
    """
    model.train()  # switch to training mode so that dropout is applied

    optimizer.zero_grad()
    out = model(data.x, data.adj_t)
    # The selected labels are flattened to 1-D before reaching loss_fn.
    loss = loss_fn(out[train_idx], torch.flatten(data.y[train_idx]))

    loss.backward()
    optimizer.step()

    return loss.item()
# Test function here
@torch.no_grad()
def test(model, data, split_idx, evaluator, save_model_results=False):
    """Evaluate the model on the train/valid/test splits.

    Args:
        model: Module called as ``model(data.x, data.adj_t)``.
        data: Graph data object exposing ``x``, ``adj_t`` and ``y``.
        split_idx: Mapping with ``'train'``, ``'valid'`` and ``'test'``
            entries used to index predictions and labels.
        evaluator: Object whose ``eval`` method takes a dict with ``y_true``
            and ``y_pred`` and returns a mapping containing ``'acc'``.
        save_model_results: When true, also write all predictions to
            ``ogbn-arxiv_node.csv``.

    Returns:
        Tuple ``(train_acc, valid_acc, test_acc)``.
    """
    model.eval()

    # The output of model on all data (no index slicing here)
    out = model(data.x, data.adj_t)

    # argmax picks, along the last dimension, the index of the largest
    # value; keepdim=True keeps that dimension with size 1. `out` is
    # presumably an n x m matrix with m classes, so y_pred is n x 1.
    y_pred = out.argmax(dim=-1, keepdim=True)

    def _split_acc(split):
        # Accuracy of the predictions restricted to one split.
        idx = split_idx[split]
        return evaluator.eval({
            'y_true': data.y[idx],
            'y_pred': y_pred[idx],
        })['acc']

    train_acc = _split_acc('train')
    valid_acc = _split_acc('valid')
    test_acc = _split_acc('test')

    if save_model_results:
        print("Saving Model Predictions")

        # Named `results` so the `data` parameter is not shadowed.
        results = {}
        # Flatten the 2-D y_pred to 1-D, move it to the CPU, detach it from
        # the graph and convert it to numpy.
        results['y_pred'] = y_pred.view(-1).cpu().detach().numpy()

        df = pd.DataFrame(data=results)
        # Save locally as csv
        df.to_csv('ogbn-arxiv_node.csv', sep=',', index=False)

    return train_acc, valid_acc, test_acc
# Please do not change the args
# NOTE(review): `device` and `Evaluator` are not defined in this file; they
# presumably come from an earlier notebook cell -- confirm.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    args = {
        'device': device,
        'num_layers': 3,
        'hidden_dim': 256,
        'dropout': 0.5,
        'lr': 0.01,
        'epochs': 100,
    }

if 'IS_GRADESCOPE_ENV' not in os.environ:
    model = GCN(data.num_features, args['hidden_dim'],
                dataset.num_classes, args['num_layers'],
                args['dropout']).to(device)
    evaluator = Evaluator(name='ogbn-arxiv')
import copy
# NOTE(review): `train_idx` and `split_idx` are not defined in this file;
# they presumably come from an earlier notebook cell -- confirm.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    # reset the parameters to initial random value
    model.reset_parameters()

    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
    # Spelled out in full because the `F` alias is never imported here.
    loss_fn = torch.nn.functional.nll_loss

    best_model = None
    best_valid_acc = 0

    for epoch in range(1, 1 + args["epochs"]):
        loss = train(model, data, train_idx, optimizer, loss_fn)
        result = test(model, data, split_idx, evaluator)
        train_acc, valid_acc, test_acc = result
        # Keep a snapshot of the model with the best validation accuracy.
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_model = copy.deepcopy(model)
        print(f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')

if 'IS_GRADESCOPE_ENV' not in os.environ:
    # Re-evaluate the best snapshot and save its predictions to csv.
    best_result = test(best_model, data, split_idx, evaluator,
                       save_model_results=True)
    train_acc, valid_acc, test_acc = best_result
    print(f'Best model: '
          f'Train: {100 * train_acc:.2f}%, '
          f'Valid: {100 * valid_acc:.2f}% '
          f'Test: {100 * test_acc:.2f}%')
4、图级别预测
① 框架构建思路:
一个图级别卷积过程包括:先利用节点GNN得到节点的表征——然后池化——最后进行一些适当的线性or非线性变化。
因此__init__函数和forward函数的coding过程就按照上述思路来。
② DataLoader与batch:
教程将OGB的dataset先转换为了DataLoader类型,并设定了batch size。难道DataLoader是使用batch的一种方式?是的:DataLoader按batch(小批量)遍历数据集,每次迭代返回一个由batch size个图合并而成的Batch对象。
③ train函数中的tips:
A、Batch:一个batch相当于一个小的dataset,里面包含了“32”(batch size)个图的信息。
B、python list解析式:其他语法糖还有dict解析式等...
C、tensor的索引可以是list和LongTensor,还可以bool数组(如train函数中注释所示)。
### GCN to predict graph property
class GCN_Graph(torch.nn.Module):
    """GCN that predicts one property vector per graph in a mini-batch.

    Pipeline: encode node features -> node embeddings from ``GCN`` ->
    pool node embeddings per graph -> linear output layer.

    NOTE(review): ``AtomEncoder`` and ``global_mean_pool`` are not imported
    in this file; presumably they come from ``ogb`` and
    ``torch_geometric.nn`` in an earlier notebook cell -- confirm.

    Args:
        hidden_dim: Width of the node encodings and node embeddings.
        output_dim: Width of the per-graph output.
        num_layers: Number of layers of the node-level ``GCN``.
        dropout: Dropout probability of the node-level ``GCN``.
    """

    def __init__(self, hidden_dim, output_dim, num_layers, dropout):
        super().__init__()

        # Load encoders for Atoms in molecule graphs
        self.node_encoder = AtomEncoder(hidden_dim)

        # Node embedding model
        # Note that the input_dim and output_dim are set to hidden_dim
        self.gnn_node = GCN(hidden_dim, hidden_dim,
                            hidden_dim, num_layers, dropout,
                            return_embeds=True)

        # Global mean pooling; called as self.pool(node_embeddings, batch).
        self.pool = global_mean_pool

        # Output layer
        self.linear = torch.nn.Linear(hidden_dim, output_dim)

    def reset_parameters(self):
        """Re-initialise the node-level GCN and the output layer."""
        self.gnn_node.reset_parameters()
        self.linear.reset_parameters()

    def forward(self, batched_data):
        """Predict the graph property for each graph in ``batched_data``.

        Args:
            batched_data: Mini-batch of graphs exposing ``x``,
                ``edge_index`` and ``batch``.

        Returns:
            The output of the linear layer applied to the pooled node
            embeddings.
        """
        # Extract important attributes of our mini-batch
        x = batched_data.x
        edge_index = batched_data.edge_index
        batch = batched_data.batch

        embed = self.node_encoder(x)
        node_embeds = self.gnn_node(embed, edge_index)
        # Aggregate node embeddings using the `batch` assignment vector.
        graph_embeds = self.pool(node_embeds, batch)
        return self.linear(graph_embeds)
def train(model, device, data_loader, optimizer, loss_fn):
    """Train the model for one pass over ``data_loader``.

    Note: this definition replaces the earlier ``train`` defined in this
    file, which has a different signature.

    Args:
        model: Module called as ``model(batch)``.
        device: Device each batch is moved to.
        data_loader: Iterable of mini-batches exposing ``x``, ``y`` and
            ``batch``.
        optimizer: Optimizer over the model parameters.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar
            tensor.

    Returns:
        The loss of the last batch that was trained on, as a float, or
        ``0.0`` if every batch was skipped.
    """
    model.train()
    last_loss = 0.0

    for batch in tqdm(data_loader, desc="Iteration"):
        # One batch acts like a small dataset holding `batch size` graphs.
        batch = batch.to(device)

        # Skip a batch that holds a single node or only a single graph.
        if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
            continue

        # Ignore nan targets (unlabeled) when computing the training loss:
        # NaN != NaN, so the mask is False exactly at NaN entries.
        is_labeled = batch.y == batch.y

        optimizer.zero_grad()
        out = model(batch)
        # A boolean tensor can index another tensor directly, so no Python
        # index list is needed.
        loss = loss_fn(out[is_labeled],
                       batch.y[is_labeled].type(torch.float32))

        loss.backward()
        optimizer.step()
        last_loss = loss.item()

    return last_loss
# The evaluation function
def eval(model, device, loader, evaluator, save_model_results=False, save_file=None):
    """Evaluate the model on every batch of ``loader``.

    Note: this function shadows the builtin ``eval``; the name is kept
    because other code in this file calls it.

    Args:
        model: Module called as ``model(batch)``.
        device: Device each batch is moved to.
        loader: Iterable of mini-batches exposing ``x`` and ``y``.
        evaluator: Object whose ``eval`` method takes a dict with ``y_true``
            and ``y_pred``.
        save_model_results: When true, also write predictions and labels to
            a csv file.
        save_file: Suffix of the csv file name; when ``None`` the file is
            named ``ogbg-molhiv_graph.csv``.

    Returns:
        Whatever ``evaluator.eval`` returns for the collected labels and
        predictions.
    """
    model.eval()
    y_true = []
    y_pred = []

    for batch in tqdm(loader, desc="Iteration"):
        batch = batch.to(device)

        # Skip a batch that holds a single node.
        if batch.x.shape[0] == 1:
            continue

        with torch.no_grad():
            pred = model(batch)

        # Labels are reshaped to the prediction shape before collecting.
        y_true.append(batch.y.view(pred.shape).detach().cpu())
        y_pred.append(pred.detach().cpu())

    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()

    input_dict = {"y_true": y_true, "y_pred": y_pred}

    if save_model_results:
        print("Saving Model Predictions")

        # Create a pandas dataframe with two columns
        # y_pred | y_true
        data = {
            'y_pred': y_pred.reshape(-1),
            'y_true': y_true.reshape(-1),
        }
        df = pd.DataFrame(data=data)
        # Save to csv. The default save_file=None used to raise TypeError
        # in the string concatenation; fall back to an un-suffixed name.
        suffix = '' if save_file is None else '_' + save_file
        df.to_csv('ogbg-molhiv_graph' + suffix + '.csv', sep=',', index=False)

    return evaluator.eval(input_dict)
# NOTE(review): `dataset`, `device` and `Evaluator` here presumably refer to
# the ogbg-molhiv setup done in a notebook cell not shown in this file --
# confirm.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    model = GCN_Graph(args['hidden_dim'],
                      dataset.num_tasks, args['num_layers'],
                      args['dropout']).to(device)
    evaluator = Evaluator(name='ogbg-molhiv')
import copy
# NOTE(review): `train_loader`, `valid_loader` and `test_loader` are not
# defined in this file; presumably built in an earlier notebook cell.
if 'IS_GRADESCOPE_ENV' not in os.environ:
    model.reset_parameters()

    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
    loss_fn = torch.nn.BCEWithLogitsLoss()

    best_model = None
    best_valid_acc = 0

    for epoch in range(1, 1 + args["epochs"]):
        print('Training...')
        loss = train(model, device, train_loader, optimizer, loss_fn)

        print('Evaluating...')
        train_result = eval(model, device, train_loader, evaluator)
        val_result = eval(model, device, valid_loader, evaluator)
        test_result = eval(model, device, test_loader, evaluator)

        # Each result is indexed by the dataset's evaluation metric name.
        metric = dataset.eval_metric
        train_acc = train_result[metric]
        valid_acc = val_result[metric]
        test_acc = test_result[metric]

        # Keep a snapshot of the model with the best validation score.
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_model = copy.deepcopy(model)
        print(f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')
End...