GNN Study Notes
Course notes for "GNN from Beginner to Mastery"
2.1 LINE (Code Implementation)
LINE: Large-scale Information Network Embedding (WWW '15)
# Reposted from: https://github.com/dsj96/LINE-Large-Scale-Information-Network-Embedding-Python
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from decimal import Decimal
import numpy as np
import collections
from tqdm import tqdm, trange
from sklearn import cluster
# LINE model implementation
# Subclasses nn.Module
class LINEModel(nn.Module):
    def __init__(self, size, embed_dim=128, order=1):
        super(LINEModel, self).__init__()
        assert order in [1, 2], "Order should either be int(1) or int(2)"
        # Embedding dimensionality
        self.embed_dim = embed_dim
        # First-order or second-order proximity, as in the paper
        self.order = order
        # One (num_nodes x embed_dim) embedding table
        self.nodes_embeddings = nn.Embedding(size, embed_dim)
        # Parameter initialization:
        # with first-order proximity only, each node needs a single embedding;
        # with second-order proximity, each node also needs a context embedding
        # (for its role as a neighbor), i.e. two tables in total
        if order == 2:
            self.contextnodes_embeddings = nn.Embedding(size, embed_dim)
            # Uniform initialization
            self.contextnodes_embeddings.weight.data = self.contextnodes_embeddings.weight.data.uniform_(-.5, .5) / embed_dim
        # Uniform initialization
        self.nodes_embeddings.weight.data = self.nodes_embeddings.weight.data.uniform_(-.5, .5) / embed_dim
    def forward(self, v_i, v_j, negsamples, device):
        v_i = self.nodes_embeddings(v_i).to(device)
        # This is where the first- and second-order computations differ:
        # second order looks up v_j and the negatives in contextnodes_embeddings,
        # first order looks them up in nodes_embeddings
        if self.order == 2:
            v_j = self.contextnodes_embeddings(v_j).to(device)
            negativenodes = -self.contextnodes_embeddings(negsamples).to(device)
        else:
            v_j = self.nodes_embeddings(v_j).to(device)
            negativenodes = -self.nodes_embeddings(negsamples).to(device)
        # First term of Equation (7) (positive samples):
        # elementwise product, then a row-wise sum, i.e. a batched dot product
        mulpositivebatch = torch.mul(v_i, v_j)
        positivebatch = F.logsigmoid(torch.sum(mulpositivebatch, dim=1))
        # Second term of Equation (7) (negative samples)
        mulnegativebatch = torch.mul(v_i.view(len(v_i), 1, self.embed_dim), negativenodes)
        negativebatch = torch.sum(F.logsigmoid(torch.sum(mulnegativebatch, dim=2)), dim=1)
        # Equation (7)
        loss = positivebatch + negativebatch
        return -torch.mean(loss)
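For reference, forward computes the per-edge negative-sampling objective, Equation (7) in the paper, for a positive edge $(i, j)$ with $K$ negatives drawn from the noise distribution $P_n(v) \propto d_v^{3/4}$:

$$\log \sigma\left(\vec{u}'^{\,\top}_{j} \vec{u}_i\right) + \sum_{k=1}^{K} \mathbb{E}_{v_k \sim P_n(v)}\left[\log \sigma\left(-\vec{u}'^{\,\top}_{v_k} \vec{u}_i\right)\right]$$

positivebatch above is the first term and negativebatch the second (the minus sign is folded into negativenodes); for order 1 the context vectors $\vec{u}'$ are replaced by the node vectors themselves, and the model returns the negated batch mean so it can be minimized.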
# Alias sampling class
class VoseAlias(object):
    """
    Builds an alias table, giving O(1) sampling.
    Adds a few modifications to https://github.com/asmith26/Vose-Alias-Method
    """
    def __init__(self, dist):
        """
        Initializer.
        (VoseAlias, dict) -> NoneType
        """
        self.dist = dist
        self.alias_initialisation()
    def alias_initialisation(self):
        """
        Construct probability and alias tables for the distribution.
        """
        # Initialise variables
        n = len(self.dist)
        self.table_prob = {}   # probability table
        self.table_alias = {}  # alias table
        scaled_prob = {}       # probabilities scaled by n
        small = []             # stack for scaled probabilities smaller than 1
        large = []             # stack for scaled probabilities greater than or equal to 1
        # Construct and sort the scaled probabilities into their appropriate stacks:
        # split the outcomes into one group with scaled probability >= 1
        # and another with scaled probability < 1
        print("1/2. Building and sorting scaled probabilities for alias table...")
        for o, p in tqdm(self.dist.items()):
            scaled_prob[o] = Decimal(p) * n
            if scaled_prob[o] < 1:
                small.append(o)
            else:
                large.append(o)
        print("2/2. Building alias table...")
        # Construct the probability and alias tables:
        # greedily top up each below-1 cell to exactly 1 using mass from a large one
        while small and large:
            s = small.pop()
            l = large.pop()
            self.table_prob[s] = scaled_prob[s]
            self.table_alias[s] = l
            # Update the donor's remaining probability mass
            scaled_prob[l] = (scaled_prob[l] + scaled_prob[s]) - Decimal(1)
            if scaled_prob[l] < 1:
                small.append(l)
            else:
                large.append(l)
        # The remaining outcomes (of one stack) must have probability 1:
        # once one stack is exhausted, every leftover cell gets probability exactly 1
        while large:
            self.table_prob[large.pop()] = Decimal(1)
        while small:
            self.table_prob[small.pop()] = Decimal(1)
        self.listprobs = list(self.table_prob)
    def alias_generation(self):
        """
        Returns a random outcome from the distribution.
        """
        # Determine which column of table_prob to inspect
        col = random.choice(self.listprobs)
        # Determine which outcome to pick in that column:
        # keep the column's own outcome...
        if self.table_prob[col] >= random.uniform(0, 1):
            return col
        # ...or fall through to its alias
        else:
            return self.table_alias[col]

    def sample_n(self, size):
        """
        Calls alias_generation n times, yielding a sample of size n from the distribution.
        """
        for i in range(size):
            yield self.alias_generation()
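A minimal sketch of how VoseAlias behaves, using a made-up three-outcome distribution (the keys and probabilities below are purely illustrative):

# Toy distribution over three outcomes; the values should sum to 1.
toy_dist = {0: 0.5, 1: 0.3, 2: 0.2}
sampler = VoseAlias(toy_dist)
draws = list(sampler.sample_n(10000))
# The empirical frequencies should approach 0.5 / 0.3 / 0.2.
print({k: draws.count(k) / len(draws) for k in toy_dist})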
# Graph-reading function
def makeDist(graphpath, power=0.75):
    # Initialize the dictionaries
    edgedistdict = collections.defaultdict(int)
    nodedistdict = collections.defaultdict(int)
    weightsdict = collections.defaultdict(int)
    nodedegrees = collections.defaultdict(int)
    # Two running sums used for normalization
    weightsum = 0
    negprobsum = 0
    # Count how many edges the graph has
    nlines = 0
    with open(graphpath, "r") as graphfile:
        for l in graphfile:
            nlines += 1
    print("Reading edgelist file...")
    maxindex = 0
    with open(graphpath, "r") as graphfile:
        # Use tqdm to show the loop's progress
        for l in tqdm(graphfile, total=nlines):
            # Strip the trailing "\n" and split on spaces; each line holds: node i, node j, weight
            line = [int(i) for i in l.replace("\n", "").split(" ")]
            node1, node2, weight = line[0], line[1], line[2]
            # These two are normalized below: edge -> weight and node -> out-degree
            edgedistdict[tuple([node1, node2])] = weight
            nodedistdict[node1] += weight
            # These two are left as-is: edge -> weight and node -> out-degree
            weightsdict[tuple([node1, node2])] = weight
            nodedegrees[node1] += weight
            # weightsum is the total edge weight of the whole graph, the empirical
            # first-order proximity normalizer from Equation (2) of the paper
            weightsum += weight
            negprobsum += np.power(weight, power)
            # maxindex tracks the largest node index in the graph
            maxindex = max(maxindex, node1, node2)
    for node, outdegree in nodedistdict.items():
        nodedistdict[node] = np.power(outdegree, power) / negprobsum
    for edge, weight in edgedistdict.items():
        edgedistdict[edge] = weight / weightsum
    # edgedistdict: normalized edge distribution
    # nodedistdict: normalized node (noise) distribution
    # weightsdict: raw edge weights
    # nodedegrees: raw node out-degrees
    # maxindex: largest node index
    return edgedistdict, nodedistdict, weightsdict, nodedegrees, maxindex
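makeDist expects a space-separated weighted edgelist with one directed edge per line in the format "node_i node_j weight", matching the parsing above. A made-up example file (node IDs and weights invented for illustration):

0 1 4
0 2 5
1 2 6

After reading, edgedistdict holds the weight-normalized edge distribution used to sample training edges, while nodedistdict holds the noise distribution for negative sampling, with out-degrees raised to the power 0.75 as in the paper.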
# Negative-sampling function (generates k negative samples for one edge)
def negSampleBatch(sourcenode, targetnode, negsamplesize, weights, nodedegrees, nodesaliassampler, t=10e-3):
    """
    For generating negative samples.
    """
    negsamples = 0
    while negsamples < negsamplesize:  # negsamplesize is 5 in our setup; keep drawing until we have 5 nodes
        # nodesaliassampler is the VoseAlias instance built over the node
        # distribution; draw a single node from it
        samplednode = next(nodesaliassampler.sample_n(1))
        # Skip the draw if it hits the source or the target node
        if (samplednode == sourcenode) or (samplednode == targetnode):
            continue
        # Otherwise emit it as a negative sample, negsamplesize nodes in total
        else:
            negsamples += 1
            yield samplednode
def makeData(samplededges, negsamplesize, weights, nodedegrees, nodesaliassampler):
    for e in samplededges:  # iterate over the sampled edges
        sourcenode, targetnode = e[0], e[1]  # source and target of the edge
        negnodes = []
        # Draw negsamplesize (5) negative nodes for this edge
        for negsample in negSampleBatch(sourcenode, targetnode, negsamplesize, weights, nodedegrees, nodesaliassampler):
            # Collect every negative node into the negnodes list
            negnodes.append(negsample)
        # The row format is (node i, node j, negative nodes...): 7 nodes in total,
        # the two endpoints of the positive edge followed by the 5 negatives
        yield [e[0], e[1]] + negnodes
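A tiny self-contained demo of the sampling pipeline (all numbers invented; weights and nodedegrees are passed through unused by negSampleBatch, so None suffices here):

# Four equally likely nodes and one positive edge (0, 1); the two negatives
# can then only come from {2, 3}, since the endpoints are rejected.
demo_nodes = VoseAlias({0: 0.25, 1: 0.25, 2: 0.25, 3: 0.25})
demo_row = next(makeData([(0, 1)], 2, None, None, demo_nodes))
print(demo_row)  # e.g. [0, 1, 2, 3]: the endpoints plus 2 negative samples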
# 1. Set the model hyperparameters
# 2. Read the graph, store nodes and edges, and normalize them
# 3. Build the alias tables for nodes and edges
# 4. LINE model implementation
# 5. Edge-wise training with negative sampling
# 6. Result display and visualization
if __name__ == "__main__":
    # 1. Set the model hyperparameters; read the graph, store nodes/edges and normalize
    # 1) Hyperparameters: first- vs second-order proximity, number of negative
    #    samples (K), embedding dimension, batch size, epochs, learning rate, etc.
    # 2) Input and output
    #    Input file: ./dataset/weighted.karate.edgelist
    #    Output file: ./model.pt
    parser = argparse.ArgumentParser()
    # Input file
    parser.add_argument("-g", "--graph_path", type=str, default='dataset/weighted.karate.edgelist')
    # Hyperparameters
    # First-order vs second-order proximity, as in the paper
    parser.add_argument("-order", "--order", type=int, default=2)
    # Number of negative samples
    parser.add_argument("-neg", "--negsamplesize", type=int, default=5)
    # Embedding dimension
    parser.add_argument("-dim", "--dimension", type=int, default=128)
    # Batch size
    parser.add_argument("-batchsize", "--batchsize", type=int, default=5)
    parser.add_argument("-epochs", "--epochs", type=int, default=1)
    # Learning rate
    parser.add_argument("-lr", "--learning_rate", type=float, default=0.025)  # starting value used in the paper
    # Exponent of the negative-sampling noise distribution
    parser.add_argument("-negpow", "--negativepower", type=float, default=0.75)
    args = parser.parse_args()
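    # A run with second-order proximity on the bundled karate edgelist could be
    # launched as follows (the script filename "line.py" is an assumption):
    #   python line.py -g dataset/weighted.karate.edgelist -order 2 -epochs 1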
    # 2. Read the graph, store nodes and edges, and normalize them
    # 1) Graph reading uses our makeDist function (kept in utils.py in the
    #    original repo; defined above in this listing)
    # Create dict of distribution when opening file
    edgedistdict, nodedistdict, weights, nodedegrees, maxindex = makeDist(args.graph_path, args.negativepower)
    # 3. Build the alias tables for nodes and edges, giving O(1) sampling
    edgesaliassampler = VoseAlias(edgedistdict)
    nodesaliassampler = VoseAlias(nodedistdict)
    # 4. LINE model implementation
    # Number of batches per epoch; batchsize edges are trained per step
    batchrange = int(len(edgedistdict) / args.batchsize)
    print('maxindex = ', maxindex)
    line = LINEModel(maxindex + 1, embed_dim=args.dimension, order=args.order)
    # SGD optimizer; Nesterov momentum refines plain momentum by evaluating
    # the gradient at the look-ahead point, i.e. after the momentum step
    opt = optim.SGD(line.parameters(), lr=args.learning_rate, momentum=0.9, nesterov=True)
    # Train on GPU if available, otherwise CPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    lossdata = {"it": [], "loss": []}
    it = 0
print("\nTraining on {}...\n".format(device))
# 共训练epoch次数
for epoch in range(args.epochs):
print("Epoch {}".format(epoch))
# 每次训练组数:batchsize
for b in trange(batchrange):
# edgesaliassampler是实现alias building的VoseAlias类,这里采样出batchsize条边
samplededges = edgesaliassampler.sample_n(args.batchsize)
# 存makeData是utils.py中的函数,为每条边采样出K条负样本边存每一条格式是(node i,node j,negative nodes...)
batch = list(makeData(samplededges, args.negsamplesize, weights, nodedegrees, nodesaliassampler))
# 转换成tensor格式
batch = torch.LongTensor(batch)
# 把一个batch的数据打印出来是这样:
# tensor([[3, 8 14, 14, 24, 2, 32],
# [25, 32, 14, 9, 4, 24, 23],
# [1, 14, 32, 1, 25, 27, 16],
# [26, 32, 30, 4, 14, 7, 4],
# [25, 32, 25, 14, 20, 14, 27]])
# 取第0列就是起始点
v_i = batch[:, 0]
# 取第1列就是终点
v_j = batch[:, 1]
# 取后面5列就是负样本
negsamples = batch[:, 2:]
# 在做BP之前将gradients置因为是梯度累加的
line.zero_grad()
# Line模型实现部分
loss = line(v_i, v_j, negsamples, device)
# 计算梯度
loss.backward()
# 根据梯度值更新参数值
opt.step()
lossdata["loss"].append(loss.item())
lossdata["it"].append(it)
it += 1
    # 6. k-means clustering on the learned node embeddings
    embedding_node = []
    for i in range(maxindex + 1):  # iterate over node indices, not edges
        i = torch.LongTensor([i])
        t = line.nodes_embeddings(i)
        embedding_node.append(t.tolist()[0])
    embedding_node = np.array(embedding_node).reshape((maxindex + 1, -1))
    y_pred = cluster.KMeans(n_clusters=3, random_state=9).fit_predict(embedding_node)
    print('y_pred = ', y_pred)
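Step 6 also calls for visualization, which this listing stops short of. A minimal sketch for plotting the loss curve collected in lossdata (matplotlib is an extra dependency, not imported above):

    import matplotlib.pyplot as plt

    # Loss per iteration over the whole run
    plt.plot(lossdata["it"], lossdata["loss"])
    plt.xlabel("iteration")
    plt.ylabel("loss")
    plt.title("LINE training loss")
    plt.show()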