dgl源码阅读笔记(3)——DeepWalk
图神经网络开源库dgl阅读笔记
前言
博客内容为个人阅读dgl代码笔记,仅供个人使用,还有许多不足需要发现改正。
一、DeepWalk简单回顾
通过对每一个采样点进行随机游走,生成多条具有相同长度的游走序列,通过Skipgram模型对游走序列进行训练学习,可以让每个节点的embedding得到梯度下降的更新。在这其中还会用到负采样技术。
二、浏览代码类
1.class DeepwalkTrainer
初始化函数如下:
–norm 是否在embedding后进行归一化
–dim embedding的维度 默认128
–window_size 窗口大小 默认5
–num_walks 每个节点随机走的次数 默认是10
–negative 是否负采样
–batch_size 默认是128
–walk_length 默认80
–neg_weight 负采样的权重
–lr 学习率
–true_neg False的时候代表不要求负采样精度很高,使用faster_sample。如果是True,就要保证负采样里没有FN数据,而生成TN的数据进行负采样。
–fast_neg 和上面的true_neg 是互斥的,True的时候代表精度可以不用很高,采用fast_neg方法
def __init__(self, args):
    """Set up the trainer.

    Builds the random-walk dataset from the parsed command-line
    arguments and records the size of the embedding table; the actual
    SkipGram model is created later (in init_device_emb).
    """
    self.args = args
    dataset_kwargs = dict(
        net_file=args.data_file,
        map_file=args.map_file,
        walk_length=args.walk_length,
        window_size=args.window_size,
        num_walks=args.num_walks,
        batch_size=args.batch_size,
        negative=args.negative,
        gpus=args.gpus,
        fast_neg=args.fast_neg,
        ogbl_name=args.ogbl_name,
        load_from_ogbl=args.load_from_ogbl,
    )
    self.dataset = DeepwalkDataset(**dataset_kwargs)
    # one embedding row per graph node (vocabulary size of the "one-hot" input)
    self.emb_size = self.dataset.G.number_of_nodes()
    self.emb_model = None
还有以下运行需要用到的函数:
判断在什么设备上运行,并且重要的是在这个函数里初始化了SkipGramModel:
def init_device_emb(self):
决定启动fast_train_mp或者fast_train:
def train(self):
启动多设备下的训练:
def fast_train_mp(self):
fast_train_mp的子进程
def fast_train_sp(self):
单设备下的训练:(本次运行涉及到)
def fast_train(self):
2.class SkipGramModel.init()
这里的参数每一个都先存到self的同名变量中
""" Negative sampling based skip-gram """
def __init__(self,
emb_size,
emb_dimension,
walk_length,
window_size,
batch_size,
only_cpu,
only_gpu,
mix,
neg_weight,
negative,
lr,
lap_norm,
fast_neg,
record_loss,
norm,
use_context_weight,
async_update,
num_threads,
):
接下来首先是对u和v的embedding进行随机初始化
并把embedding初始化为±initrange(1/128)范围内的均匀分布(init.uniform_,并非正态分布)
再用0填充weight中的空值
# initialize the device as cpu
self.device = torch.device("cpu")
# content embedding
self.u_embeddings = nn.Embedding(
self.emb_size, self.emb_dimension, sparse=True)
# context embedding
self.v_embeddings = nn.Embedding(
self.emb_size, self.emb_dimension, sparse=True)
# initialze embedding
initrange = 1.0 / self.emb_dimension
init.uniform_(self.u_embeddings.weight.data, -initrange, initrange) # 把embedding限制在initrange(1/128)的正态分布内
init.constant_(self.v_embeddings.weight.data, 0) # 用0填充weight中的空值,虽然我怀疑是否用得上
初始化lookup_table:在-6.01到6.01区间以0.01为步长预先算好sigmoid值用于快速查表(logsigmoid_table同理),并把首尾分别赋值为0和1(对应sigmoid两端的饱和值)
# lookup_table is used for fast sigmoid computing
self.lookup_table = torch.sigmoid(torch.arange(-6.01, 6.01, 0.01))
self.lookup_table[0] = 0.
self.lookup_table[-1] = 1.
if self.record_loss:
self.logsigmoid_table = torch.log(torch.sigmoid(torch.arange(-6.01, 6.01, 0.01)))
self.loss = []
下一步是对正负样本的节点序列选取。
# indexes to select positive/negative node pairs from batch_walks
self.index_emb_posu, self.index_emb_posv = init_emb2pos_index(
self.walk_length,
self.window_size,
self.batch_size)
self.index_emb_negu, self.index_emb_negv = init_emb2neg_index(
self.walk_length,
self.window_size,
self.negative,
self.batch_size)
正负节点样本选取调用的两个函数,我以正样本为例,进入了下面的函数
def init_emb2pos_index
该函数返回出正样本,即存在的边对应的u,v向量
按照定义来说,在同一个batch下的窗口内的两个节点就是正采样的节点,所以按照此三重循环,可以取出所有满足的节点对。b和j用于控制采样是在同一个batch下,满足条件的i和j对应的就是u,v两个节点。
def init_emb2pos_index(walk_length, window_size, batch_size):
    """Build index tensors that select positive (context, center) pairs.

    For every center position ``i`` of every walk in the batch, each
    in-range position within ``window_size`` steps to the left or right of
    ``i`` forms one positive pair.  Indices are flattened, i.e. position
    ``i`` of walk ``b`` maps to ``i + b * walk_length``.

    Returns a pair of LongTensors (u-indices, v-indices), each of length
    ``batch_size * num_pos``, where u is the context slot and v the center.
    """
    u_idx = []
    v_idx = []
    for batch in range(batch_size):
        base = batch * walk_length
        for center in range(walk_length):
            # valid window around the center, clipped at the walk ends
            lo = max(0, center - window_size)
            hi = min(walk_length, center + window_size + 1)
            for ctx in range(lo, hi):
                if ctx == center:
                    continue
                u_idx.append(base + ctx)
                v_idx.append(base + center)
    # [num_pos * batch_size]
    return torch.LongTensor(u_idx), torch.LongTensor(v_idx)
def init_emb2neg_index
对于负采样进行比较
同样是输出u,v,三重循环中,只是选取了同样数量的u,乘上negative的倍数。
因为是fast_sample,所以对v的选取不是很准确,而是生成这样大小的列表list(range(batch_size * walk_length)) * negative * window_size * 2 ,这样肯定会导致超出所需。
下一步idx_list_v = idx_list_v[:len(idx_list_u)] 把v列表控制在u列表的大小中。
def init_emb2neg_index(walk_length, window_size, negative, batch_size):
    """Build index tensors for fast (approximate) negative sampling.

    Each center position contributes ``negative`` copies of itself per
    valid context slot on the u side.  The v side is drawn from an
    oversampled, shuffled pool of all flattened positions in the batch and
    then trimmed to the same length as u — fast, but may contain false
    negatives.
    """
    u_idx = []
    for batch in range(batch_size):
        base = batch * walk_length
        for center in range(walk_length):
            # number of in-range context slots around this center
            n_ctx = min(window_size, center) \
                + min(window_size, walk_length - 1 - center)
            u_idx.extend([base + center] * (negative * n_ctx))

    # oversample candidate positions, shuffle, then trim to match u
    v_pool = list(range(batch_size * walk_length)) \
        * negative * window_size * 2
    random.shuffle(v_pool)
    v_idx = v_pool[:len(u_idx)]

    # [bs * walk_length * negative]
    return torch.LongTensor(u_idx), torch.LongTensor(v_idx)
class SkipGramModel的其他函数
然后回到SkipGram.__init__的下一步,决定是否初始化上下文权重矩阵
初始化优化器、梯度
if self.use_context_weight:
self.context_weight = init_weight(
self.walk_length,
self.window_size,
self.batch_size)
# adam
self.state_sum_u = torch.zeros(self.emb_size)
self.state_sum_v = torch.zeros(self.emb_size)
# gradients of nodes in batch_walks
self.grad_u, self.grad_v = init_empty_grad(
self.emb_dimension,
self.walk_length,
self.batch_size)
还需要注意到以下函数:
设置异步更新子进程:
def create_async_update(self):
终止异步更新子进程:
def finish_async_update(self):
多线程共享参数
def share_memory(self):
决定设备参数,并把lookup_table,grad等移到上面
def set_device(self, gpu_id):
把参数移到gpu上
def all_to_device(self, gpu_id):
用预处理的lookup_table,返回sigmoid和fast_logsigmoid值
def fast_sigmoid(self, score):
    """Approximate sigmoid(score) by table lookup.

    Scores are assumed to be pre-clamped to [-6, 6]; each value is mapped
    to a 0.01-wide bucket of self.lookup_table.
    """
    bucket = ((score + 6.01) / 0.01).floor().long()
    return self.lookup_table[bucket]
def fast_logsigmoid(self, score):
    """Approximate log(sigmoid(score)) by table lookup.

    Same bucketing scheme as fast_sigmoid, but reads from the
    precomputed self.logsigmoid_table.
    """
    bucket = ((score + 6.01) / 0.01).floor().long()
    return self.logsigmoid_table[bucket]
前向传播:
def forward(self, pos_u, pos_v, neg_v):
存下训练后的节点embedding,为了以后的使用:
def save_embedding(self, dataset, file_name):
def save_embedding_pt(self, dataset, file_name):
def save_embedding_pt_dgl_graph(self, dataset, file_name):
def save_embedding_txt(self, dataset, file_name):
进行随机游走序列的提取:
def fast_learn(self, batch_walks, neg_nodes=None):
接下来展开讲解这个比较重要的fastlearn函数:
def fast_learn
以下是函数说明。
不用前向传播自动计算梯度,用lookup_table计算实现sigmoid
并用u_embedding[i] += (label - score) * v_embedding[j]的方式更新特征向量
其中的batch_walks就是传入的walks,也就是128*80(batch有128个点,路径是80步)的矩阵,下图也是对该函数的一些说明,比较好理解
def fast_learn(self, batch_walks, neg_nodes=None):
"""
Learn a batch of random walks in a fast way. It has the following features:
1. It calculating the gradients directly without the forward operation.
2. It does sigmoid by a looking up table.
Specifically, for each positive/negative node pair (i,j), the updating procedure is as following:
score = self.fast_sigmoid(u_embedding[i].dot(v_embedding[j]))
# label = 1 for positive samples; label = 0 for negative samples.
u_embedding[i] += (label - score) * v_embedding[j]
v_embedding[i] += (label - score) * u_embedding[j]
Parameters
----------
batch_walks list : a list of node sequences
lr float : current learning rate
neg_nodes torch.LongTensor : a long tensor of sampled true negative nodes. If neg_nodes is None,
then do negative sampling randomly from the nodes in batch_walks as an alternative.
Usage example
-------------
batch_walks = [torch.LongTensor([1,2,3,4]),
torch.LongTensor([2,3,4,2])])
lr = 0.01
neg_nodes = None
"""
首先要明确的是,batch_walks(128,80)是batch_num和walklength,所以是所有出现在当前batch中的所有walk的节点集合
接下来规定了学习率,存储当前序列到nodes中
self.u_embeddings和self.v_embeddings是利用torch.nn.Embedding随机生成的矩阵,目前还没有进行训练,特别的是用的不是one-hot向量,而是随机生成的,且维度为(1138499, 128),即整个数据集的节点数乘特征向量维度
emb_u,emb_v对应的是将128*80的nodes放入,且对应节点编号寻找对应的128长度的特征向量,这样每一行对应一个节点的初始向量。
lr = self.lr
# [batch_size, walk_length]
if isinstance(batch_walks, list):
nodes = torch.stack(batch_walks)
elif isinstance(batch_walks, torch.LongTensor):
nodes = batch_walks
if self.only_gpu:
nodes = nodes.to(self.device)
if neg_nodes is not None:
neg_nodes = neg_nodes.to(self.device)
emb_u = self.u_embeddings(nodes).view(-1, self.emb_dimension).to(self.device)
emb_v = self.v_embeddings(nodes).view(-1, self.emb_dimension).to(self.device)
接下来开始进行正采样样本的训练和更新过程:
首先判断batch是否是最后一个batch,然后将正采样的节点对(u,v)序列存到下面的数据结构中
Emb_pos_u就是从emb_u中,以第0维度为索引,查找所有在index_emb_posu中的节点,这时就有98560的维度
98560 = 128 * (800 - 2 * (5 + 5 * 4 / 2))
## Postive
bs = len(batch_walks)
if bs < self.batch_size:
index_emb_posu, index_emb_posv = init_emb2pos_index(
self.walk_length,
self.window_size,
bs)
index_emb_posu = index_emb_posu.to(self.device)
index_emb_posv = index_emb_posv.to(self.device)
else:
index_emb_posu = self.index_emb_posu
index_emb_posv = self.index_emb_posv
# num_pos: the number of positive node pairs generated by a single walk sequence
# [batch_size * num_pos, dim]
emb_pos_u = torch.index_select(emb_u, 0, index_emb_posu)
emb_pos_v = torch.index_select(emb_v, 0, index_emb_posv)
接下来开始计算每一对节点特征向量相乘,因为是正采样,需要达到1为最好,score被定义为如下的模式
pos_score = torch.sum(torch.mul(emb_pos_u, emb_pos_v), dim=1) # torch.mul是对两个结构一样的矩阵,每一个对应位置上的数字相乘,所以不改变维度,然后再按列求和,这样列维度就变成了1
pos_score = torch.clamp(pos_score, max=6, min=-6)# 最大值和最小值被限制在了[-6,6]维度中,不是归一化,而是超过范围的值都赋值为边界值,因为超过这个范围进入sigmoid的值相差不会有多少,防止过平滑
# [batch_size * num_pos, 1]
score = (1 - self.fast_sigmoid(pos_score)).unsqueeze(1) # unsqueeze用于增加维度,本来是行向量的score,变成了列向量
if self.record_loss:
self.loss.append(torch.mean(self.fast_logsigmoid(pos_score)).item())
接下来计算正采样梯度,可能会考虑到拉普拉斯归一化
# [batch_size * num_pos, dim]
if self.lap_norm > 0:
grad_u_pos = score * emb_pos_v + self.lap_norm * (emb_pos_v - emb_pos_u)
grad_v_pos = score * emb_pos_u + self.lap_norm * (emb_pos_u - emb_pos_v)
else:
grad_u_pos = score * emb_pos_v
grad_v_pos = score * emb_pos_u
if self.use_context_weight:
if bs < self.batch_size:
context_weight = init_weight(
self.walk_length,
self.window_size,
bs).to(self.device)
else:
context_weight = self.context_weight
grad_u_pos *= context_weight
grad_v_pos *= context_weight
然后初始化self.grad,并对grad_u和grad_v进行更新
利用到了这个函数:index_add_
对于x.index_add_(0, index, source) 0是对应的维度(行),x是要更新的矩阵,index对应每次更新x的哪一行,source从上往下遍历的矩阵,每次从最上面取一行加到Index对应的x行中。
# [batch_size * walk_length, dim]
if bs < self.batch_size:
grad_u, grad_v = init_empty_grad(
self.emb_dimension,
self.walk_length,
bs)
grad_u = grad_u.to(self.device)
grad_v = grad_v.to(self.device)
else:
self.grad_u = self.grad_u.to(self.device)
self.grad_u.zero_()
self.grad_v = self.grad_v.to(self.device)
self.grad_v.zero_()
grad_u = self.grad_u
grad_v = self.grad_v
grad_u.index_add_(0, index_emb_posu, grad_u_pos)
grad_v.index_add_(0, index_emb_posv, grad_v_pos)
下面是负采样更新梯度,大同小异
## Negative
if bs < self.batch_size:
index_emb_negu, index_emb_negv = init_emb2neg_index(
self.walk_length, self.window_size, self.negative, bs)
index_emb_negu = index_emb_negu.to(self.device)
index_emb_negv = index_emb_negv.to(self.device)
else:
index_emb_negu = self.index_emb_negu
index_emb_negv = self.index_emb_negv
emb_neg_u = torch.index_select(emb_u, 0, index_emb_negu)
if neg_nodes is None:
emb_neg_v = torch.index_select(emb_v, 0, index_emb_negv)
else:
emb_neg_v = self.v_embeddings.weight[neg_nodes].to(self.device)
# [batch_size * walk_length * negative, dim]
neg_score = torch.sum(torch.mul(emb_neg_u, emb_neg_v), dim=1)
neg_score = torch.clamp(neg_score, max=6, min=-6)
# [batch_size * walk_length * negative, 1]
score = - self.fast_sigmoid(neg_score).unsqueeze(1)
if self.record_loss:
self.loss.append(self.negative * self.neg_weight * torch.mean(self.fast_logsigmoid(-neg_score)).item())
grad_u_neg = self.neg_weight * score * emb_neg_v
grad_v_neg = self.neg_weight * score * emb_neg_u
grad_u.index_add_(0, index_emb_negu, grad_u_neg)
if neg_nodes is None:
grad_v.index_add_(0, index_emb_negv, grad_v_neg)
接下来是更新阶段:
在这里将刚才计算出的梯度放到adam优化器中更新grad
## Update
nodes = nodes.view(-1) # -1代表的是没有考虑到的剩下的维度数量,比如这里就是将128*80的矩阵变成了10240的向量
# use adam optimizer
grad_u = adam(grad_u, self.state_sum_u, nodes, lr, self.device, self.only_gpu)
grad_v = adam(grad_v, self.state_sum_v, nodes, lr, self.device, self.only_gpu)
if neg_nodes is not None:
grad_v_neg = adam(grad_v_neg, self.state_sum_v, neg_nodes, lr, self.device, self.only_gpu)
最后利用grad_u和grad_v将u和v的embedding进行更新,
if not self.async_update:
self.u_embeddings.weight.data.index_add_(0, nodes.view(-1), grad_u)
self.v_embeddings.weight.data.index_add_(0, nodes.view(-1), grad_v)
if neg_nodes is not None:
self.v_embeddings.weight.data.index_add_(0, neg_nodes.view(-1), grad_v_neg)
return
def adam
在计算梯度后,引入了adam优化器,处理后的grad才会给embedding更新用
关于Adam优化器原理可以参考这篇博客:简单认识Adam优化器
下面代码中,state_sum累积的是每行梯度平方的均值(相当于二阶矩vt的简化版),并没有一阶动量mt,实现上其实更接近AdaGrad,比正规的Adam实现简单了一些
def adam(grad, state_sum, nodes, lr, device, only_gpu):
    """Scale a batch of embedding gradients with an AdaGrad-style rule.

    Accumulates the per-row mean squared gradient into ``state_sum``
    (in place; kept on CPU unless ``only_gpu``) and divides the raw
    gradient by the square root of the accumulated value, scaled by the
    learning rate.

    Parameters
    ----------
    grad : Tensor [n, dim], raw gradients for the nodes in the batch
    state_sum : Tensor [emb_size], running sum of mean squared gradients
    nodes : LongTensor [n], node ids matching the rows of ``grad``
    lr : float, current learning rate
    device : torch.device, where the scaled gradient should live
    only_gpu : bool, when False the accumulator stays on CPU

    Returns
    -------
    Tensor [n, dim], the scaled gradient.
    """
    grad_sum = (grad * grad).mean(1)
    if not only_gpu:
        grad_sum = grad_sum.cpu()
    state_sum.index_add_(0, nodes, grad_sum)  # accumulate second moment (cpu)
    # fancy indexing copies, so the in-place sqrt_ below leaves state_sum intact
    std = state_sum[nodes].to(device)
    std_values = std.sqrt_().add_(1e-10).unsqueeze(1)  # eps avoids div-by-zero
    grad = lr * grad / std_values
    # bug fix: callers (grad_u = adam(...)) rely on the result being returned
    return grad
3. class DeepwalkSampler
整个类的代码如下:
其实是调用dgl.sampling.random_walk产生随机游走结果
class DeepwalkSampler(object):
    """Random-walk sampler, a thin wrapper over dgl.sampling.random_walk."""

    def __init__(self, G, seeds, walk_length):
        """Store the sampling configuration.

        Parameter
        ---------
        G dgl.Graph : the input graph
        seeds torch.LongTensor : starting nodes
        walk_length int : walk length
        """
        self.G = G
        self.seeds = seeds
        self.walk_length = walk_length

    def sample(self, seeds):
        """Return one trace of walk_length nodes per seed node."""
        result = dgl.sampling.random_walk(
            self.G, seeds, length=self.walk_length - 1)
        return result[0]
def random_walk
返回值的第一个部分就是根据nodes序列生成的shape(nodes)个随机游走序列
def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob=None,
return_eids=False):
传参给了g, nodes,length,其余为默认值
这里面最重要的是
traces, eids, types = _CAPI_DGLSamplingRandomWalk(gidx, nodes, metapath, p_nd)
traces就是随机游走后的序列
n_etypes = len(g.canonical_etypes)
n_ntypes = len(g.ntypes)
if metapath is None:
if n_etypes > 1 or n_ntypes > 1:
raise DGLError("metapath not specified and the graph is not homogeneous.")
if length is None:
raise ValueError("Please specify either the metapath or the random walk length.")
metapath = [0] * length
else:
metapath = [g.get_etype_id(etype) for etype in metapath]
gidx = g._graph
nodes = F.to_dgl_nd(utils.prepare_tensor(g, nodes, 'nodes'))
metapath = F.to_dgl_nd(utils.prepare_tensor(g, metapath, 'metapath'))
# Load the probability tensor from the edge frames
if prob is None:
p_nd = [nd.array([], ctx=nodes.ctx) for _ in g.canonical_etypes]
else:
p_nd = []
for etype in g.canonical_etypes:
if prob in g.edges[etype].data:
prob_nd = F.to_dgl_nd(g.edges[etype].data[prob])
if prob_nd.ctx != nodes.ctx:
raise ValueError(
'context of seed node array and edges[%s].data[%s] are different' %
(etype, prob))
else:
prob_nd = nd.array([], ctx=nodes.ctx)
p_nd.append(prob_nd)
# Actual random walk
if restart_prob is None:
traces, eids, types = _CAPI_DGLSamplingRandomWalk(gidx, nodes, metapath, p_nd)
elif F.is_tensor(restart_prob):
restart_prob = F.to_dgl_nd(restart_prob)
traces, eids, types = _CAPI_DGLSamplingRandomWalkWithStepwiseRestart(
gidx, nodes, metapath, p_nd, restart_prob)
else:
traces, eids, types = _CAPI_DGLSamplingRandomWalkWithRestart(
gidx, nodes, metapath, p_nd, restart_prob)
traces = F.from_dgl_nd(traces)
types = F.from_dgl_nd(types)
eids = F.from_dgl_nd(eids)
return (traces, eids, types) if return_eids else (traces, types)
三、代码执行流程
1. 主函数启动
也就是DeepwalkTrainer开始启动,传入默认参数,建立出DeepwalkTrainer
然后进入train函数
trainer = DeepwalkTrainer(args)
trainer.train()
def train(self):
    """Train the embedding, dispatching on the number of configured GPUs."""
    multi_gpu = len(self.args.gpus) > 1
    if multi_gpu:
        self.fast_train_mp()
    else:
        self.fast_train()
接着跳转到fast_train中
fast_train
下面的一部分是对相关的参数进行初始化,这里唯一需要注意的num_pos(正采样个数)是考虑到序列前后节点不足window个邻居对应的数量。
def fast_train(self):
""" fast train with dataloader with only gpu / only cpu"""
# the number of postive node pairs of a node sequence
num_pos = 2 * self.args.walk_length * self.args.window_size \
- self.args.window_size * (self.args.window_size + 1) # 后面减去的内容就是在随机游走序列两端的不够window长度的节点所缺少的训练pos数
num_pos = int(num_pos)
self.init_device_emb() # 根据传入的arg参数对device进行赋值
# 和定义SkipGramModel()
if self.args.async_update:
self.emb_model.share_memory()
self.emb_model.create_async_update()
if self.args.count_params:
sum_up_params(self.emb_model)
sampler = self.dataset.create_sampler(0)
dataloader = DataLoader(
dataset=sampler.seeds,
batch_size=self.args.batch_size,
collate_fn=sampler.sample,
shuffle=False,
drop_last=False,
num_workers=self.args.num_sampler_threads,
)
num_batches = len(dataloader)
print("num batchs: %d\n" % num_batches)
需要注意到在 sampler = self.dataset.create_sampler(0),调用了这个函数,接着返回了DeepwalkSampler类。
所以可以在下一行的dataloader = DataLoader()实现产生随机游走序列迭代器的功能。
def create_sampler(self, i):
    """Return a DeepwalkSampler over the i-th shard of seed nodes."""
    shard_seeds = self.seeds[i]
    return DeepwalkSampler(self.G, shard_seeds, self.walk_length)
下面的代码,常规的torch.no_grad(),在此之内的代码不受梯度计算的影响
对dataloader里的每个batch的编号和128*80的向量序列进行遍历
进入到emb_model.fast_learn(walks)中,在这里进行向量的更新过程。
其他的操作就是输入输出的控制,判断之前是否进行过fast负采样,没有的话就进行np.random.choice的负采样,还有控制输出的语句。
这个代码就结束了。
start_all = time.time()
start = time.time()
with torch.no_grad():
max_i = num_batches
for i, walks in enumerate(dataloader):# 对dataloader里的每个batch的编号和128*80的向量序列进行遍历
if self.args.fast_neg:
self.emb_model.fast_learn(walks)
else:
# do negative sampling
bs = len(walks)
neg_nodes = torch.LongTensor(
np.random.choice(self.dataset.neg_table,
bs * num_pos * self.args.negative,
replace=True))
self.emb_model.fast_learn(walks, neg_nodes=neg_nodes)
if i > 0 and i % self.args.print_interval == 0:
if self.args.print_loss:
print("Batch %d training time: %.2fs loss: %.4f" \
% (i, time.time() - start, -sum(self.emb_model.loss) / self.args.print_interval))
self.emb_model.loss = []
else:
print("Batch %d, training time: %.2fs" % (i, time.time() - start))
start = time.time()
if self.args.async_update:
self.emb_model.finish_async_update()
print("Training used time: %.2fs" % (time.time() - start_all))
if self.args.save_in_txt:
self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
elif self.args.save_in_pt:
self.emb_model.save_embedding_pt(self.dataset, self.args.output_emb_file)
else:
self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)