DeepWalk Explained: Reference Material
Zhihu column
https://zhuanlan.zhihu.com/p/56380812
GitHub repositories; this article was adapted and refined from repositories 1 and 3 below
https://github.com/shenweichen/GraphEmbedding
https://github.com/ZhaoLiang-GitHub/GraphEmbedding-Deeplearning
https://github.com/swallown1/GraphEmbedding
Original paper
Paper: Perozzi B, Al-Rfou R, Skiena S. DeepWalk: Online learning of social representations. Proceedings of KDD 2014.
Environment
networkx==2.2
The Word2Vec call at the end uses the gensim 4.x keyword names (vector_size, epochs), so gensim >= 4.0 is also required; the remaining packages have no particular version constraints.
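A minimal install covering every import used below (the package list beyond networkx is an assumption inferred from the code, not stated in the source):
pip install networkx==2.2 "gensim>=4.0" scikit-learn pandas matplotlib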
from __future__ import print_function
import pandas as pd
import random
# networkx is used to build the graph data
import networkx as nx
# import the ready-made Word2Vec model from gensim.models
from gensim.models import Word2Vec
# used for training the downstream classification task
import numpy
from sklearn.metrics import f1_score, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
# TSNE is used for dimensionality reduction
from sklearn.manifold import TSNE
import numpy as np
# import the logistic regression model from sklearn
from sklearn.linear_model import LogisticRegression
# used to plot the clusters after dimensionality reduction
import matplotlib.pyplot as plt
# Build the alias table (Vose's alias method: O(1) sampling from a discrete distribution)
def create_alias_table(area_ratio):
    """
    :param area_ratio: probability of each outcome, sum(area_ratio) = 1
    :return: accept, alias
    """
    l = len(area_ratio)
    accept, alias = [0] * l, [0] * l
    small, large = [], []
    # scale each probability by the number of outcomes so the average column height is 1
    area_ratio_ = np.array(area_ratio) * l
    for i, prob in enumerate(area_ratio_):
        if prob < 1.0:
            small.append(i)
        else:
            large.append(i)
    # repeatedly top up a "small" column with mass taken from a "large" one
    while small and large:
        small_idx, large_idx = small.pop(), large.pop()
        accept[small_idx] = area_ratio_[small_idx]
        alias[small_idx] = large_idx
        area_ratio_[large_idx] = area_ratio_[large_idx] - \
            (1 - area_ratio_[small_idx])
        if area_ratio_[large_idx] < 1.0:
            small.append(large_idx)
        else:
            large.append(large_idx)
    # any leftover columns are (up to rounding) exactly full
    while large:
        large_idx = large.pop()
        accept[large_idx] = 1
    while small:
        small_idx = small.pop()
        accept[small_idx] = 1
    return accept, alias
# Sampling according to the alias table
def alias_sample(accept, alias):
    """
    :param accept: acceptance probabilities from create_alias_table
    :param alias: alias indices from create_alias_table
    :return: sampled index
    """
    N = len(accept)
    # pick a column uniformly at random, then either accept it or fall through to its alias
    i = int(np.random.random() * N)
    r = np.random.random()
    if r < accept[i]:
        return i
    else:
        return alias[i]
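# Quick sanity check of the alias method (an illustrative sketch, not part of the
# original sources): draw 100000 samples from [0.5, 0.3, 0.2] and compare the
# empirical frequencies against the target distribution.
def _alias_sanity_check():
    accept, alias = create_alias_table([0.5, 0.3, 0.2])
    counts = np.zeros(3)
    for _ in range(100000):
        counts[alias_sample(accept, alias)] += 1
    print(counts / counts.sum())  # should be close to [0.5, 0.3, 0.2]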
# Random walk generator
class RandomWalker:
    def __init__(self, G, p=1, q=1):
        """
        :param G: the graph to walk on
        :param p: return parameter, controls the likelihood of immediately revisiting a node
        :param q: in-out parameter, lets the search distinguish "inward" and "outward" nodes
        """
        self.G = G
        self.p = p
        self.q = q

    # Successor nodes are chosen uniformly at random with replacement:
    # a random walk is a depth-first-style traversal that may revisit nodes.
    def deepwalk_walk(self, walk_length, start_node):
        # the walk starts at start_node
        walk = [start_node]
        # stop once the walk reaches the target length
        while len(walk) < walk_length:
            cur = walk[-1]
            # fetch the neighbors of the current node
            cur_nbrs = list(self.G.neighbors(cur))
            if len(cur_nbrs) > 0:
                # pick one neighbor uniformly at random
                walk.append(random.choice(cur_nbrs))
            else:
                break
        return walk
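    # A toy illustration (not from the original sources): on the path graph
    # 0-1-2-3 built with nx.path_graph(4), a walk of length 3 from node 1
    # could be [1, 2, 3] or [1, 0, 1]; each step picks a neighbor uniformly.
    #   demo_walker = RandomWalker(nx.path_graph(4))
    #   print(demo_walker.deepwalk_walk(walk_length=3, start_node=1))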
    # Simulate the walks
    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
        # :param num_walks: how many walks to start from every node
        # :param walk_length: the length of each walk
        G = self.G
        nodes = list(G.nodes())
        walks = self._simulate_walks(nodes, num_walks, walk_length)
        # parallel version (requires joblib's Parallel/delayed and a partition_num helper):
        # results = Parallel(n_jobs=workers, verbose=verbose, )(
        #     delayed(self._simulate_walks)(nodes, num, walk_length) for num in
        #     partition_num(num_walks, workers))
        # walks = list(itertools.chain(*results))
        return walks
    def _simulate_walks(self, nodes, num_walks, walk_length, ):
        # :param num_walks: how many walks to start from every node
        # :param walk_length: the length of each walk
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                if self.p == 1 and self.q == 1:
                    # p = q = 1 degenerates to a plain DeepWalk walk
                    walks.append(self.deepwalk_walk(
                        walk_length=walk_length, start_node=v))
                else:
                    # otherwise use the biased node2vec walk
                    walks.append(self.node2vec_walk(
                        walk_length=walk_length, start_node=v))
        return walks
    def get_alias_edge(self, t, v):
        """
        Compute the unnormalized transition probabilities between node v and its
        neighbors, given the previously visited node t.
        :param t: the previous node of the walk
        :param v: the current node of the walk
        :return: alias table for sampling v's successor
        """
        G = self.G
        p = self.p
        q = self.q
        unnormalized_probs = []
        for x in G.neighbors(v):
            weight = G[v][x].get('weight', 1.0)  # w_vx
            if x == t:  # d_tx == 0: returning to the previous node
                unnormalized_probs.append(weight / p)
            elif G.has_edge(x, t):  # d_tx == 1: staying close to t
                unnormalized_probs.append(weight)
            else:  # d_tx > 1: moving away from t
                unnormalized_probs.append(weight / q)
        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob) / norm_const for u_prob in unnormalized_probs]
        return create_alias_table(normalized_probs)
    def preprocess_transition_probs(self):
        """
        Preprocessing of transition probabilities for guiding the random walks.
        """
        G = self.G
        # per-node alias tables: used for the first step of a walk
        alias_nodes = {}
        for node in G.nodes():
            unnormalized_probs = [G[node][nbr].get('weight', 1.0)
                                  for nbr in G.neighbors(node)]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [
                float(u_prob) / norm_const for u_prob in unnormalized_probs]
            alias_nodes[node] = create_alias_table(normalized_probs)
        # per-edge alias tables: used for every subsequent (second-order) step
        alias_edges = {}
        for edge in G.edges():
            alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
        self.alias_nodes = alias_nodes
        self.alias_edges = alias_edges
        return
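    # _simulate_walks above calls self.node2vec_walk when p != 1 or q != 1, but that
    # method is missing from this excerpt. Below is a minimal sketch of it, modeled
    # on the shenweichen/GraphEmbedding repository this article is based on; it
    # assumes preprocess_transition_probs() has been called first.
    def node2vec_walk(self, walk_length, start_node):
        G = self.G
        alias_nodes = self.alias_nodes
        alias_edges = self.alias_edges
        walk = [start_node]
        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(G.neighbors(cur))
            if len(cur_nbrs) > 0:
                if len(walk) == 1:
                    # first step: sample from the node-level alias table
                    walk.append(cur_nbrs[alias_sample(*alias_nodes[cur])])
                else:
                    # later steps: sample from the edge-level (second-order) table
                    prev = walk[-2]
                    walk.append(cur_nbrs[alias_sample(*alias_edges[(prev, cur)])])
            else:
                break
        return walk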
# One-vs-rest classifier that predicts the top-k classes per sample
class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()  # indices of the k highest-scoring classes
            probs_[:] = 0  # zero out all class probabilities
            probs_[labels] = 1  # mark the predicted classes with 1
            all_labels.append(probs_)
        return numpy.asarray(all_labels)
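# A toy illustration of the top-k trick (not from the source): if a node has
# probs_ = [0.1, 0.7, 0.2] and carries k = 2 true labels, argsort()[-2:] picks
# classes 2 and 1, so the returned indicator row is [0, 1, 1].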
# Wrapper that trains and evaluates a classifier on the node embeddings
class Classifier(object):
    def __init__(self, embeddings, clf):
        self.embeddings = embeddings  # the learned embeddings used as input features
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)  # multi-label binarizer

    def train(self, X, Y, Y_all):
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def evaluate(self, X, Y):
        # each sample is allowed as many predictions as it has true labels
        top_k_list = [len(l) for l in Y]
        Y_ = self.predict(X, top_k_list)
        Y = self.binarizer.transform(Y)
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        results['acc'] = accuracy_score(Y, Y_)
        print('-------------------')
        print(results)
        return results

    def predict(self, X, top_k_list):
        X_ = numpy.asarray([self.embeddings[x] for x in X])
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def split_train_evaluate(self, X, Y, train_percent, seed=0):
        state = numpy.random.get_state()
        training_size = int(train_percent * len(X))
        numpy.random.seed(seed)
        shuffle_indices = numpy.random.permutation(numpy.arange(len(X)))
        X_train = [X[shuffle_indices[i]] for i in range(training_size)]
        Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
        X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
        Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]
        self.train(X_train, Y_train, Y)
        numpy.random.set_state(state)
        return self.evaluate(X_test, Y_test)
# Read node ids and their labels from a file
def read_node_label(filename, skip_head=False):
    fin = open(filename, 'r')
    X = []
    Y = []
    # skip the header once, before reading any records
    if skip_head:
        fin.readline()
    while 1:
        l = fin.readline()
        if l == '':
            break
        vec = l.strip().split('\t')  # hmdd20 dataset is tab-separated
        # vec = l.strip().split(' ')  # wiki dataset is space-separated
        X.append(vec[0])
        Y.append(vec[1:])
    fin.close()
    return X, Y
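# Expected layout of label.txt, inferred from the parsing above (hypothetical rows
# for illustration): one node per line, the node id first and then one or more
# labels, all separated by '\t', e.g.
#   362\t1
#   105\t2\t5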
# Once the embeddings are learned, validate their quality on a node classification task
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/hmdd20/label.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)

# Project the embeddings to 2D with t-SNE and scatter-plot them colored by label
def plot_embeddings(embeddings,):
    X, Y = read_node_label('../data/hmdd20/label.txt')
    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)
    # group node indices by their first label so each label gets one color
    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)
    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # read the edge list and build the graph
    # G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
    #                      create_using=nx.DiGraph(), nodetype=None,
    #                      data=[('weight', int)])
    G = nx.read_edgelist('../data/hmdd20/onlyknowndiseasemirnainteraction.txt',
                         create_using=nx.Graph(), nodetype=None)
    # random walker; p = q = 1 means plain DeepWalk
    walker = RandomWalker(G, p=1, q=1)
    # generate the node visit sequences
    num_walks = 1  # how many "sentences" to generate per node
    walk_length = 3  # length of each sentence (a toy value; the DeepWalk paper uses much longer walks, e.g. 40)
    sampling_workers = 1  # number of threads
    sentences = walker.simulate_walks(
        num_walks=num_walks, walk_length=walk_length, workers=sampling_workers, verbose=0)
    # train the model on the walks (feed the sequences to gensim's Word2Vec)
    embed_size = 128
    window_size = 5
    word2vec_workers = 3
    num_epochs = 5
    kwargs = {}
    kwargs["sentences"] = sentences
    kwargs["min_count"] = 0
    kwargs["vector_size"] = embed_size  # dimensionality of the learned embeddings
    kwargs["sg"] = 1  # skip-gram
    kwargs["hs"] = 1  # DeepWalk uses hierarchical softmax
    kwargs["workers"] = word2vec_workers  # number of threads
    kwargs["window"] = window_size  # window size used to build training pairs
    kwargs["epochs"] = num_epochs  # number of training epochs
    print("Learning embedding vectors...")
    model = Word2Vec(**kwargs)
    print("Learning embedding vectors done!")
    # collect one embedding vector per graph node
    embeddings = {}
    for word in G.nodes():
        embeddings[word] = model.wv[word]
    print(embeddings['362'])  # inspect the embedding of node '362'
    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
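    # The trained KeyedVectors can also be queried directly, e.g. to list the
    # nodes closest to '362' in embedding space (a usage sketch, not in the
    # original sources):
    # print(model.wv.most_similar('362', topn=5))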