batch_size=128,
bins=16, # 相似度直方图的分箱数(仅在 histogram=True 时生效)
bottle_neck_neurons=16, # 瓶颈层神经元
dropout=0.5,
epochs=5,
filters_1=128,
filters_2=64,
filters_3=32,
histogram=False, # 直方图
learning_rate=0.001,
load_path=None,
save_path=None,
tensor_neurons=16, # 张量层神经元
testing_graphs='../dataset/test/',
training_graphs='../dataset/train/',
weight_decay=0.0005
这里路径在命令行里 './dataset/test/' 是没有问题的,但在 Jupyter 里就读不到路径,得改成 '../dataset/test/'。原因是相对路径是以当前工作目录(cwd)为基准解析的:命令行从项目根目录运行,而 notebook 的工作目录是 notebook 文件所在的子目录,所以要多退一级
可通过texttable包美化一下输出
+---------------------+------------------+
| Batch size | 128 |
+=====================+==================+
| Bins | 16 |
+---------------------+------------------+
| Bottle neck neurons | 16 |
+---------------------+------------------+
| Dropout | 0.500 |
+---------------------+------------------+
| Epochs | 5 |
+---------------------+------------------+
| Filters 1 | 128 |
+---------------------+------------------+
| Filters 2 | 64 |
+---------------------+------------------+
| Filters 3 | 32 |
+---------------------+------------------+
| Histogram | 0 |
+---------------------+------------------+
| Learning rate | 0.001 |
+---------------------+------------------+
| Load path | None |
+---------------------+------------------+
| Save path | None |
+---------------------+------------------+
| Tensor neurons | 16 |
+---------------------+------------------+
| Testing graphs | ../dataset/test/ |
+---------------------+------------------+
| Training graphs | ../dataset/train/ |
+---------------------+------------------+
| Weight decay | 0.001 |
+---------------------+------------------+
将上述args喂入训练模型SimGNNTrainer
首先初始化
def initial_label_enumeration(self):
    """
    Collect every unique node label across the training and testing
    graphs and map each label to a contiguous integer index.

    Side effects: sets ``self.training_graphs`` / ``self.testing_graphs``
    (lists of JSON paths), ``self.global_labels`` (label -> index dict,
    keys in sorted order) and ``self.number_of_labels``.
    """
    print("\nEnumerating unique labels.\n")
    # glob.glob returns all matching paths (regex-like file lookup);
    # NOTE: it silently returns [] when the directory is wrong, e.g.
    # './dataset/test/' when running from a notebook.
    self.training_graphs = glob.glob(self.args.training_graphs + "*.json")
    self.testing_graphs = glob.glob(self.args.testing_graphs + "*.json")
    all_pairs = self.training_graphs + self.testing_graphs
    labels = set()
    for path in tqdm(all_pairs):
        # process_pair loads one graph-pair JSON (graphs + label lists).
        pair = process_pair(path)
        labels.update(pair["labels_1"], pair["labels_2"])
    # Sort for a deterministic label -> index assignment.
    self.global_labels = {label: index for index, label in enumerate(sorted(labels))}
    self.number_of_labels = len(self.global_labels)
训练正向:
# Forward/training loop over epochs.
# NOTE(review): `epochs` appears to be a tqdm trange (it supports
# set_description) created outside this snippet — confirm in the caller.
for epoch in epochs:
    batches = self.create_batches()
    # Split the dataset into batch_size-sized chunks -> list of lists of paths.
    self.loss_sum = 0
    main_index = 0
    for index, batch in tqdm(enumerate(batches), total=len(batches), desc="Batches"):
        # process_batch returns the batch loss already divided out per pair,
        # so weighting by len(batch) recovers the running per-pair average.
        loss_score = self.process_batch(batch)
        main_index = main_index + len(batch)
        self.loss_sum = self.loss_sum + loss_score * len(batch)
        loss = self.loss_sum/main_index
    # Show the epoch-average loss on the progress bar.
    epochs.set_description("Epoch (Loss=%g)" % round(loss, 5))
其中
def process_batch(self, batch):
    """
    Run one optimisation step over a batch of graph pairs.

    :param batch: list of JSON file paths, one graph pair per file.
    :return: summed MSE loss over the batch as a Python float
             (0.0 for an empty batch).
    """
    if not batch:  # guard: backward() on the int 0 would raise
        return 0.0
    self.optimizer.zero_grad()
    losses = 0
    for graph_pair in batch:
        data = process_pair(graph_pair)
        # transfer_to_torch one-hot encodes node labels
        # (num_nodes x num_label_kinds) and turns the GED into a
        # normalised exp(-ged) similarity target.
        data = self.transfer_to_torch(data)
        target = data["target"]
        prediction = self.model(data)
        # Fix: use the `target` local (it was assigned but unused) and
        # flatten prediction ([1, 1]) to match target ([1]) so mse_loss
        # does not silently broadcast; the value is unchanged.
        losses = losses + torch.nn.functional.mse_loss(target, prediction.view(-1))
    # backward() is called exactly once per batch, so retain_graph=True
    # was unnecessary and only kept graph memory alive.
    losses.backward()
    self.optimizer.step()
    return losses.item()
其中transfer_to_torch
数据集data如下包括两张图graph_1和graph_2,以及两组标签labels_1和labels_2
将所有标签排序(标签是字符串,sorted 用的是字典序,比如 "10" 会排在 "2" 前面;但这里只需要一个确定且两端一致的顺序来分配索引,所以字典序并不会乱)
然后再按key是标签,value是对应索引的格式保存成字典
def calculate_bottleneck_features(self):
    """
    Decide the input width of the bottleneck (first fully-connected) layer.

    When histogram features are enabled, the tensor-network output is
    concatenated with a `bins`-entry similarity histogram; otherwise only
    the tensor-network output feeds the bottleneck layer.

    Side effect: sets ``self.feature_count``.
    """
    if self.args.histogram:  # idiomatic truth test (was `== True`)
        self.feature_count = self.args.tensor_neurons + self.args.bins
    else:
        # Default path (histogram=False): feature_count == tensor_neurons (16 here).
        self.feature_count = self.args.tensor_neurons
进而搭起模型
SimGNN(
(convolution_1): GCNConv(16, 128)
#relu,dropout
(convolution_2): GCNConv(128, 64)
#relu,dropout
(convolution_3): GCNConv(64, 32)
(attention): AttentionModule()
(tensor_network): TenorNetworkModule()
(fully_connected_first): Linear(in_features=16, out_features=16, bias=True)
(scoring_layer): Linear(in_features=16, out_features=1, bias=True)
)
其中注意力层的计算方式没有理解,张量层则是基于下述公式
因为args.load_path为none,所以trainer.fit()
对于每一轮:
首先分批(都只有一批)
对于这一批里的每一个json得到data,再将其torch化
'edge_index_1' 相当于是把 "graph_1" 这样一个 50×2 的边列表镜像(交换两列得到反向边)后拼在原来的下面得到 100×2 的矩阵,再将其转置得到 2×100 的矩阵
"features_1" 相当于把 "labels_1" 对应 global_labels 字典的 value(其实就是索引)进行独热编码
最后 "target":
norm_ged = data["ged"]/(0.5*(len(data["labels_1"])+len(data["labels_2"])))
new_data["target"] = torch.from_numpy(np.exp(-norm_ged).reshape(1, 1)).view(-1).float()
顺便终于整明白了 forward 的调用机制:nn.Module 实现了 __call__,所以像函数一样调用模型实例(例如 model(data))时,Python 会先进入 __call__,再由它转发到 forward
这时data就是送入SimGNN类的forward
对两张图分别经过三层图卷积,然后再经过注意力池化,最后一起进入张量层,经过一些激活得到结果
于是将 data["target"] 与计算结果求误差,这里原作者没有统一维度:一个是 torch.Size([1, 1]),另一个是 torch.Size([1]),mse_loss 会做隐式广播,可能触发警告甚至报错,其实加一个 .view(-1) 去掉一层括号就能对齐
然后就是评估,保存了