系列文章目录
前言
在 3 图游走算法 中简单介绍了图游走算法的原理和实现,这一章是图游走算法的一个应用,作为一个补充。
PS:日常感谢度娘:[实战二] 图神经网络七日打卡训练营——图游走算法核心代码实现
一、整体框架
通过图随机游走算法,我们得到了每个节点的embedding vector,其中归纳了当前节点与相邻节点的信息,也就相当于NLP领域中对词完成了向量化表达,接下来做的就是下游任务的处理,通常情况下,下游任务一般分为3类,图级别、边级别、节点级别。在此不再赘述。
整体代码的框架可以用如下图表示:
二、代码解读
1.读取数据集
这里使用的数据集依旧是ArXiv
def load(name):
    """Load the requested graph dataset by name.

    Args:
        name: dataset identifier, either 'BlogCatalog' or 'ArXiv'.

    Returns:
        The corresponding dataset object from ``data_loader``.

    Raises:
        ValueError: if ``name`` is not a known dataset.
    """
    if name == 'BlogCatalog':
        dataset = data_loader.BlogCatalogDataset()
    elif name == "ArXiv":
        dataset = data_loader.ArXivDataset()
    else:
        # Fixed grammar of the error message ("doesn't exists" -> "doesn't exist").
        raise ValueError(name + " dataset doesn't exist")
    return dataset
dataset = load(args.dataset)
这个数据集相关的信息在这里
class ArXivDataset(object):
    """ArXiv dataset implementation

    Loads the ca-AstroPh edge list, deduplicates it into undirected
    edges, samples negative (non-edge) node pairs, holds out half of the
    real edges as positive link-prediction samples, and builds a graph
    from the remaining edges.

    Args:
        np_random_seed: The random seed for numpy.

    Attributes:
        graph: The :code:`Graph` data object.
    """

    def __init__(self, np_random_seed=123):
        # NOTE(review): presumably resolves a local cache directory for
        # the raw data files — confirm against get_default_data_dir.
        self.path = get_default_data_dir("arXiv")
        self.np_random_seed = np_random_seed
        self._load_data()

    def _load_data(self):
        # Fix the seed so negative sampling and the shuffle/split below
        # are reproducible across runs.
        np.random.seed(self.np_random_seed)
        edge_path = os.path.join(self.path, 'ca-AstroPh.txt')
        bi_edges = set()        # undirected edges stored as (smaller_id, larger_id)
        self.neg_edges = []     # sampled node pairs with no edge (negatives)
        self.pos_edges = []     # held-out real edges (positives)
        self.node2id = dict()   # raw node label -> dense integer id

        def node_id(node):
            # Assign the next dense id the first time a node label is
            # seen, then return that node's id.
            if node not in self.node2id:
                self.node2id[node] = len(self.node2id)
            return self.node2id[node]

        with io.open(edge_path) as inf:
            # Skip the file's 4 header/comment lines.
            for _ in range(4):
                inf.readline()
            for line in inf:
                # Each data line is "<u>\t<v>", one edge per line.
                u, v = line.strip('\n').split('\t')
                # Map both raw labels to dense integer ids.
                u, v = node_id(u), node_id(v)
                # Store the edge with the smaller id first so (u, v) and
                # (v, u) collapse to a single set entry (deduplication of
                # the undirected edge).
                if u < v:
                    bi_edges.add((u, v))
                else:
                    bi_edges.add((v, u))
        # Total number of distinct nodes seen.
        num_nodes = len(self.node2id)
        # Sample negative pairs (no edge in either direction) until we
        # have half as many negatives as there are real edges.
        while len(self.neg_edges) < len(bi_edges) // 2:
            # Draw len(bi_edges) candidate pairs at once, each entry an
            # integer sampled uniformly from range(num_nodes).
            random_edges = np.random.choice(num_nodes, [len(bi_edges), 2])
            for (u, v) in random_edges:
                if u != v and (u, v) not in bi_edges and (v, u
                                                          ) not in bi_edges:
                    self.neg_edges.append((u, v))
                    if len(self.neg_edges) == len(bi_edges) // 2:
                        break
        bi_edges = list(bi_edges)
        np.random.shuffle(bi_edges)
        # Hold out half of the real edges as positive samples.
        self.pos_edges = bi_edges[:len(bi_edges) // 2]
        # The remaining half builds the graph; add both directions so the
        # undirected edges become a symmetric directed edge list.
        bi_edges = bi_edges[len(bi_edges) // 2:]
        all_edges = []
        for edge in bi_edges:
            u, v = edge
            all_edges.append((u, v))
            all_edges.append((v, u))
        self.graph = graph.Graph(num_nodes=num_nodes, edges=all_edges)
2.划分训练集和测试集
# Total number of labeled samples (positive + negative edges).
num_edges = len(dataset.pos_edges) + len(dataset.neg_edges)
# Split the labeled edges 50/50 into a training and a test portion.
train_num_edges = int(len(dataset.pos_edges) * 0.5) + int(
    len(dataset.neg_edges) * 0.5)
test_num_edges = num_edges - train_num_edges
3.构建model
3.1 feed 数据
同样的,得先知道数据啥样
# Feed the training reader with the first half of both the positive and
# negative edges; batch_size equals the number of training edges, so one
# batch covers the whole training set per epoch.
train_pyreader.decorate_tensor_provider(
    link_predict_generator(
        dataset.pos_edges[:train_num_edges // 2],
        dataset.neg_edges[:train_num_edges // 2],
        batch_size=train_num_edges,
        epoch=epoch))
# Feed the test reader with the remaining edges; a single pass (epoch=1)
# of one full-size batch.
test_pyreader.decorate_tensor_provider(
    link_predict_generator(
        dataset.pos_edges[train_num_edges // 2:],
        dataset.neg_edges[train_num_edges // 2:],
        batch_size=test_num_edges,
        epoch=1))
数据生成器长这个样子,先打标签,然后wrapper调用batch_edges_generator
def link_predict_generator(pos_edges,
                           neg_edges,
                           batch_size=512,
                           epoch=2000,
                           shuffle=True):
    """Build a batch generator of labeled edges for link prediction.

    Args:
        pos_edges: iterable of (u, v) node-id pairs that are real edges
            (label 1).
        neg_edges: iterable of (u, v) node-id pairs with no edge between
            them (label 0).
        batch_size: number of edges per yielded batch.
        epoch: number of full passes over the edge list made by the
            returned callable.
        shuffle: whether to shuffle the edge order on each pass.

    Returns:
        A zero-argument callable; iterating its result yields tuples
        (src_ids, dst_ids, labels), each an int64 array of shape
        [batch, 1] (the last batch of a pass may be smaller).
    """
    all_edges = []
    # Label positives 1 and negatives 0; u, v are node ids.
    for (u, v) in pos_edges:
        all_edges.append([u, v, 1])
    for (u, v) in neg_edges:
        all_edges.append([u, v, 0])
    # all_edges.shape = [num_edges, 3]: columns are (src, dst, label).
    all_edges = np.array(all_edges, np.int64)

    def batch_edges_generator(shuffle=shuffle):
        # perm is a permutation of edge indices; shuffling it reorders
        # the edges without touching all_edges itself.
        perm = np.arange(len(all_edges), dtype=np.int64)
        if shuffle:
            np.random.shuffle(perm)
        start = 0
        # Yield batch_size edges at a time until the pass is exhausted.
        while start < len(all_edges):
            yield all_edges[perm[start:start + batch_size]]
            start += batch_size

    def wrapper():
        for _ in range(epoch):
            for batch_edges in batch_edges_generator():
                # batch_edges.shape = [batch, 3]; slice out the three
                # [batch, 1] columns directly (equivalent to the original
                # batch_edges.T[i:i+1].T round-trip, but clearer):
                # column 0 = src ids, column 1 = dst ids, column 2 = labels.
                yield (batch_edges[:, 0:1], batch_edges[:, 1:2],
                       batch_edges[:, 2:3])

    return wrapper
# Build the training program: model graph plus Adam with a polynomially
# decayed learning rate.
with fluid.program_guard(train_prog, startup_prog):
    with fluid.unique_name.guard():
        train_pyreader, train_loss, train_probs, train_labels = link_predict_model(
            dataset.graph.num_nodes, hidden_size=hidden_size, name='train')
        # Decay LR from 0.025 to 0.0001 over train_steps steps.
        lr = l.polynomial_decay(0.025, train_steps, 0.0001)
        adam = fluid.optimizer.Adam(lr)
        adam.minimize(train_loss)

# Build the test program (shares parameters via startup_prog) and clone
# it in inference mode so no gradient ops are kept.
with fluid.program_guard(test_prog, startup_prog):
    with fluid.unique_name.guard():
        test_pyreader, test_loss, test_probs, test_labels = link_predict_model(
            dataset.graph.num_nodes, hidden_size=hidden_size, name='test')
        test_prog = test_prog.clone(for_test=True)
其中,边预测模型的代码如下
def link_predict_model(num_nodes,
                       hidden_size=16,
                       name='link_predict_task',
                       binary_op_type="Weighted-L2"):
    """Build the link-prediction network (Paddle fluid static graph).

    Args:
        num_nodes: total node count; sizes the embedding lookup table.
        hidden_size: embedding dimension.
        name: prefix used to name the py_reader.
        binary_op_type: how two node embeddings are combined into an
            edge embedding; see ``binary_op``.

    Returns:
        (pyreader, loss, prob, label): the data reader, the mean sigmoid
        cross-entropy loss, predicted edge probabilities, and the
        ground-truth label tensor.
    """
    pyreader = l.py_reader(
        capacity=70,
        # Three inputs per batch — src node id, dst node id, edge label —
        # each of shape [batch_size, 1].
        shapes=[[-1, 1], [-1, 1], [-1, 1]],
        dtypes=['int64', 'int64', 'int64'],
        lod_levels=[0, 0, 0],
        name=name + '_pyreader',
        use_double_buffer=True)
    u, v, label = l.read_file(pyreader)
    # u_embed and v_embed look up the node embeddings trained beforehand
    # by DeepWalk / node2vec; both lookups share the parameter named
    # 'content', i.e. the pretrained embedding table.
    u_embed = l.embedding(
        input=u,
        size=[num_nodes, hidden_size],
        param_attr=fluid.ParamAttr(name='content'))
    v_embed = l.embedding(
        input=v,
        size=[num_nodes, hidden_size],
        param_attr=fluid.ParamAttr(name='content'))
    # Freeze the pretrained embeddings: only the FC classifier below
    # receives gradient updates.
    u_embed.stop_gradient = True
    v_embed.stop_gradient = True
    # Combine the two node embeddings into a single edge embedding.
    edge_embed = binary_op(u_embed, v_embed, binary_op_type)
    logit = l.fc(input=edge_embed, size=1)
    loss = l.sigmoid_cross_entropy_with_logits(logit, l.cast(label, 'float32'))
    loss = l.reduce_mean(loss)
    prob = l.sigmoid(logit)
    return pyreader, loss, prob, label
上面有一个函数binary_op
def binary_op(u_embed, v_embed, binary_op_type):
    """Combine two node embeddings into one edge embedding.

    Supported combinators: "Average" (element-wise mean), "Hadamard"
    (element-wise product), "Weighted-L1" (absolute difference) and
    "Weighted-L2" (squared difference).

    Raises:
        ValueError: for an unknown ``binary_op_type``.
    """
    if binary_op_type == "Average":
        return (u_embed + v_embed) / 2
    if binary_op_type == "Hadamard":
        return u_embed * v_embed
    if binary_op_type == "Weighted-L1":
        return l.abs(u_embed - v_embed)
    if binary_op_type == "Weighted-L2":
        diff = u_embed - v_embed
        return diff * diff
    raise ValueError(binary_op_type + " binary_op_type doesn't exists")
4. 训练
# NOTE(review): the original indentation was lost in this paste; the
# evaluation section below may actually have been nested inside the
# training loop (run periodically — the `step % 10` guard suggests so).
# Reconstructed here as sequential; confirm against the upstream example.

# Training loop: run until the train reader raises EOF (all epochs done).
while 1:
    try:
        train_loss_val, train_probs_val, train_labels_val = exe.run(
            train_prog,
            fetch_list=[train_loss, train_probs, train_labels],
            return_numpy=True)
        # Score this batch: ROC curve then area under it.
        fpr, tpr, thresholds = metrics.roc_curve(train_labels_val,
                                                 train_probs_val)
        train_auc = metrics.auc(fpr, tpr)
        step += 1
        # Log the first step and every 10th step thereafter.
        if step == 1 or step % 10 == 0:
            log.info("Step %d " % step + "Train Loss: %f " % train_loss_val +
                     "Train AUC: %f " % train_auc)
    except fluid.core.EOFException:
        # Reader drained: reset it and leave the training loop.
        train_pyreader.reset()
        break

# Evaluation: drain the test reader, collecting per-batch outputs.
test_pyreader.start()
test_probs_vals, test_labels_vals = [], []
while 1:
    try:
        test_loss_val, test_probs_val, test_labels_val = exe.run(
            test_prog,
            fetch_list=[test_loss, test_probs, test_labels],
            return_numpy=True)
        test_probs_vals.append(
            test_probs_val), test_labels_vals.append(test_labels_val)
    except fluid.core.EOFException:
        test_pyreader.reset()
        # Concatenate all batches and compute the overall test AUC.
        test_probs_array = np.concatenate(test_probs_vals)
        test_labels_array = np.concatenate(test_labels_vals)
        fpr, tpr, thresholds = metrics.roc_curve(test_labels_array,
                                                 test_probs_array)
        test_auc = metrics.auc(fpr, tpr)
        if step == 1 or step % 10 == 0:
            log.info("\t\tStep %d " % step + "Test Loss: %f " %
                     test_loss_val + "Test AUC: %f " % test_auc)
        break
5. 总结
整体来说,代码不难,但是有两个比较魔性的地方:
- 在
load_data
函数中,构造图的过程中,用两个节点构造边时,比较的其实是两个节点 id 的大小(而不是节点列表的长度),保证边总是以 (小id, 大id) 的形式存入集合。这样 (u, v) 和 (v, u) 会被归一化为同一条记录,从而对无向边完成去重。 - 可以通过node embedding生成边的embedding方式,这个还是很有意思的。