1.run
import networkx as nx

G = nx.read_edgelist('./data/wiki/Wiki_edgelist.txt',
                     create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
1.1.read_edgelist
@open_file(0, mode="rb")
def read_edgelist(
    path,
    comments="#",
    delimiter=None,
    create_using=None,
    nodetype=None,
    data=True,
    edgetype=None,
    encoding="utf-8",
):
    lines = (line if isinstance(line, str) else line.decode(encoding) for line in path)
    return parse_edgelist(
        lines,
        comments=comments,
        delimiter=delimiter,
        create_using=create_using,
        nodetype=nodetype,
        data=data,
    )
- open_file is a custom NetworkX decorator; the note below, taken from its documentation, explains what it does:
Normally, we'd want to use "with" to ensure that fobj gets closed.
However, the decorator will make `path` a file object for us,
and using "with" would undesirably close that file object.
Instead, we use a try block, as shown above.
When we exit the function, fobj will be closed, if it should be, by the decorator.
- The file is first wrapped in a generator (not a tuple) that yields one decoded line at a time; this generator is then handed to parse_edgelist.
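- A minimal sketch of that hand-off (my own illustration, not repo code): decoding the lines yourself and calling the public nx.parse_edgelist builds the same graph as read_edgelist.
import networkx as nx

with open('./data/wiki/Wiki_edgelist.txt', 'rb') as f:
    lines = (line.decode('utf-8') for line in f)   # lazy generator, one line at a time
    G = nx.parse_edgelist(lines, create_using=nx.DiGraph())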
1.2.parse_edgelist
def parse_edgelist(
    lines, comments="#", delimiter=None, create_using=None, nodetype=None, data=True
):
    from ast import literal_eval

    G = nx.empty_graph(0, create_using)
    for line in lines:
        p = line.find(comments)
        if p >= 0:
            line = line[:p]
        if not line:
            continue
        # split line, should have 2 or more
        s = line.strip().split(delimiter)
        if len(s) < 2:
            continue
        u = s.pop(0)
        v = s.pop(0)
        d = s
        if nodetype is not None:
            try:
                u = nodetype(u)
                v = nodetype(v)
            except Exception as e:
                raise TypeError(
                    f"Failed to convert nodes {u},{v} to type {nodetype}."
                ) from e

        if len(d) == 0 or data is False:
            # no data or data type specified
            edgedata = {}
        elif data is True:
            # no edge types specified
            try:  # try to evaluate as dictionary
                if delimiter == ",":
                    edgedata_str = ",".join(d)
                else:
                    edgedata_str = " ".join(d)
                edgedata = dict(literal_eval(edgedata_str.strip()))
            except Exception as e:
                raise TypeError(
                    f"Failed to convert edge data ({d}) to dictionary."
                ) from e
        else:
            # convert edge data to dictionary with specified keys and type
            if len(d) != len(data):
                raise IndexError(
                    f"Edge data {d} and data_keys {data} are not the same length"
                )
            edgedata = {}
            for (edge_key, edge_type), edge_value in zip(data, d):
                try:
                    edge_value = edge_type(edge_value)
                except Exception as e:
                    raise TypeError(
                        f"Failed to convert {edge_key} data {edge_value} "
                        f"to type {edge_type}."
                    ) from e
                edgedata.update({edge_key: edge_value})
        G.add_edge(u, v, **edgedata)
    return G
- Builds an empty graph via empty_graph, then splits each line into tokens: the first two become the endpoints u and v (lines with fewer than two tokens are skipped) and add_edge inserts the edge; finally the graph is returned.
- The file used here carries no weight data; if it did, the leftover tokens would sit in d, be converted to the declared edge_type (int here), stored as a dict in edgedata, and passed to add_edge as keyword arguments.
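- For example (a hypothetical line, since the wiki file carries none): with data=[('weight', int)], a third token is cast to int and attached to the edge.
import networkx as nx

G = nx.parse_edgelist(["1 2 7"], create_using=nx.DiGraph(),
                      data=[("weight", int)])
print(G["1"]["2"])   # {'weight': 7}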
1.3.empty_graph
@nodes_or_number(0)
def empty_graph(n=0, create_using=None, default=Graph):
    if create_using is None:
        G = default()
    elif hasattr(create_using, "_adj"):
        # create_using is a NetworkX style Graph
        create_using.clear()
        G = create_using
    else:
        # try create_using as constructor
        G = create_using()
    n_name, nodes = n
    G.add_nodes_from(nodes)
    return G
- Here create_using is an nx.DiGraph() instance; since it has an _adj attribute, it is cleared and reused, so an empty directed graph comes back.
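- A quick check of the branches (my own illustration; prints the resulting graph types):
import networkx as nx

print(type(nx.empty_graph(0)))                 # Graph: the default
print(type(nx.empty_graph(0, nx.DiGraph())))   # DiGraph: the instance is cleared and reused
print(type(nx.empty_graph(0, nx.MultiGraph)))  # MultiGraph: the constructor is called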
2.DeepWalk
model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
2.1.RandomWalker
class DeepWalk:
    def __init__(self, graph, walk_length, num_walks, workers=1):
        self.graph = graph
        self.w2v_model = None
        self._embeddings = {}

        self.walker = RandomWalker(graph, p=1, q=1)
        self.sentences = self.walker.simulate_walks(
            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)

class RandomWalker:
    def __init__(self, G, p=1, q=1, use_rejection_sampling=False):
        """
        :param G:
        :param p: Return parameter, controls the likelihood of immediately revisiting a node in the walk.
        :param q: In-out parameter, allows the search to differentiate between "inward" and "outward" nodes.
        :param use_rejection_sampling: Whether to use the rejection sampling strategy in node2vec.
        """
        self.G = G
        self.p = p
        self.q = q
        self.use_rejection_sampling = use_rejection_sampling
- RandomWalker is a class written for node2vec; with p = q = 1 it degenerates to DeepWalk's uniform random walk, as sketched below.
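- To see why p = q = 1 reduces to a uniform walk: node2vec weights each candidate next node nxt, given the walk came from prev to cur, by 1/p if nxt == prev, 1 if nxt is also a neighbor of prev, and 1/q otherwise; with p = q = 1 all weights are 1. A sketch of that bias (my own paraphrase of the node2vec paper, not the repo's alias-table code):
def node2vec_bias(G, prev, cur, nxt, p, q):
    # unnormalized weight for stepping cur -> nxt, having arrived from prev
    if nxt == prev:                 # step back to where we came from
        return 1.0 / p
    elif G.has_edge(nxt, prev):     # "inward": nxt stays close to prev
        return 1.0
    else:                           # "outward": nxt moves away from prev
        return 1.0 / q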
2.2.simulate_walks
def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
    G = self.G
    nodes = list(G.nodes())
    results = Parallel(n_jobs=workers, verbose=verbose)(
        delayed(self._simulate_walks)(nodes, num, walk_length)
        for num in partition_num(num_walks, workers))
    walks = list(itertools.chain(*results))
    return walks
- Parallelizes walk generation with joblib: partition_num splits num_walks across workers (see the sketch below), each job runs _simulate_walks on its share, and itertools.chain flattens the per-job results.
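- partition_num is a small helper from the same repo; a plausible implementation consistent with how it is called here (check the repo for the exact code):
def partition_num(num, workers):
    # e.g. partition_num(80, 3) -> [26, 26, 26, 2]; each entry becomes one job
    if num % workers == 0:
        return [num // workers] * workers
    return [num // workers] * workers + [num % workers]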
def _simulate_walks(self, nodes, num_walks, walk_length):
    walks = []
    for _ in range(num_walks):
        random.shuffle(nodes)
        for v in nodes:
            if self.p == 1 and self.q == 1:
                walks.append(self.deepwalk_walk(
                    walk_length=walk_length, start_node=v))
            elif self.use_rejection_sampling:
                walks.append(self.node2vec_walk2(
                    walk_length=walk_length, start_node=v))
            else:
                walks.append(self.node2vec_walk(
                    walk_length=walk_length, start_node=v))
    return walks
- Each pass shuffles the nodes and starts one walk of length walk_length from every node; the pass is repeated num_walks times.
2.3.deepwalk_walk
def deepwalk_walk(self, walk_length, start_node):
    walk = [start_node]

    while len(walk) < walk_length:
        cur = walk[-1]
        cur_nbrs = list(self.G.neighbors(cur))
        if len(cur_nbrs) > 0:
            walk.append(random.choice(cur_nbrs))
        else:
            break
    return walk
- Generates one random-walk sequence: repeatedly append a uniformly random neighbor of the current node, stopping early at a node with no out-neighbors.
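- Inlined on a toy graph (my own illustration): on a directed 3-cycle every node has exactly one successor, so the walk is deterministic.
import networkx as nx
import random

G = nx.DiGraph([(1, 2), (2, 3), (3, 1)])
walk = [1]
while len(walk) < 5:
    nbrs = list(G.neighbors(walk[-1]))
    if not nbrs:
        break                        # dead end: stop early
    walk.append(random.choice(nbrs))
print(walk)   # [1, 2, 3, 1, 2]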
3.train
model.train(window_size=5, iter=3, embed_size=128)
def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
    # note: gensim >= 4.0 renamed the old `size`/`iter` arguments to `vector_size`/`epochs`
    kwargs["sentences"] = self.sentences
    kwargs["min_count"] = kwargs.get("min_count", 0)
    kwargs["vector_size"] = embed_size
    kwargs["sg"] = 1  # skip-gram
    kwargs["hs"] = 1  # DeepWalk uses hierarchical softmax
    kwargs["workers"] = workers
    kwargs["window"] = window_size
    kwargs["epochs"] = iter

    print("Learning embedding vectors...")
    model = Word2Vec(**kwargs)
    print("Learning embedding vectors done!")

    self.w2v_model = model
    return model
- Builds and returns a gensim Word2Vec model trained on the walk sentences (skip-gram with hierarchical softmax, following the DeepWalk paper).
4.get_embeddings
embeddings = model.get_embeddings()
def get_embeddings(self):
    if self.w2v_model is None:
        print("model not train")
        return {}

    self._embeddings = {}
    for word in self.graph.nodes():
        self._embeddings[word] = self.w2v_model.wv[word]

    return self._embeddings
- Word2Vec.wv is a KeyedVectors object: indexing it with a word (here, a node id) returns that word's embedding vector.
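- For example (node ids are stored as strings, so index with the string id; this assumes a node '1' exists in the graph):
vec = model.w2v_model.wv['1']                        # numpy array of shape (128,)
print(model.w2v_model.wv.most_similar('1', topn=5))  # nearest nodes in embedding space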
5.evaluate_embeddings
evaluate_embeddings(embeddings)
def evaluate_embeddings(embeddings):
    X, Y = read_node_label('./data/wiki/wiki_labels.txt')
    tr_frac = 0.8
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings, clf=LogisticRegression())
    clf.split_train_evaluate(X, Y, tr_frac)
5.1.read_node_label
def read_node_label(filename, skip_head=False):
    fin = open(filename, 'r')
    X = []
    Y = []
    while 1:
        if skip_head:
            fin.readline()
        l = fin.readline()
        if l == '':
            break
        vec = l.strip().split(' ')
        X.append(vec[0])
        Y.append(vec[1:])
    fin.close()
    return X, Y
- Reads each node id into X and its label(s) into Y; each line is split on single spaces, so the first token is the node id and the rest are its labels.
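- So a wiki_labels.txt line is expected to look like "node_id label [label ...]"; for example (hypothetical contents):
X, Y = read_node_label('./data/wiki/wiki_labels.txt')
# X[0] == '0'     (node id, kept as a string)
# Y[0] == ['1']   (list of labels; multi-label nodes get several entries)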
5.2.Classifier
class Classifier(object):

    def __init__(self, embeddings, clf):
        self.embeddings = embeddings
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def evaluate(self, X, Y):
        top_k_list = [len(l) for l in Y]
        Y_ = self.predict(X, top_k_list)
        Y = self.binarizer.transform(Y)
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        results['acc'] = accuracy_score(Y, Y_)
        print('-------------------')
        print(results)
        return results

    def predict(self, X, top_k_list):
        X_ = numpy.asarray([self.embeddings[x] for x in X])
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        state = numpy.random.get_state()
        training_size = int(train_precent * len(X))
        numpy.random.seed(seed)
        shuffle_indices = numpy.random.permutation(numpy.arange(len(X)))
        X_train = [X[shuffle_indices[i]] for i in range(training_size)]
        Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
        X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
        Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]
        self.train(X_train, Y_train, Y)
        numpy.random.set_state(state)
        return self.evaluate(X_test, Y_test)
5.3.self.train
def train(self, X, Y, Y_all):
    self.binarizer.fit(Y_all)
    X_train = [self.embeddings[x] for x in X]
    Y = self.binarizer.transform(Y)
    self.clf.fit(X_train, Y)
- MultiLabelBinarizer converts each label set into a binary vector whose length is the number of classes: the positions of the node's labels are 1, all others 0.
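- A dense-output example of the same transform (the classifier above uses sparse_output=True, which only changes the matrix type):
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([['0'], ['1'], ['2']])                     # Y_all contains three classes
print(mlb.transform([['1'], ['0', '2']]).tolist())
# [[0, 1, 0], [1, 0, 1]]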
5.3.1.TopKRanker
class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            probs_[:] = 0
            probs_[labels] = 1
            all_labels.append(probs_)
        return numpy.asarray(all_labels)
- TopKRanker builds an OneVsRestClassifier with LogisticRegression as the estimator and overrides predict: for sample i it marks the top_k_list[i] classes with the highest predicted probability as that sample's labels.
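- The argsort trick in isolation (my own illustration). Note that probs_[labels] = 1 uses the class labels as column indices, which works because for multilabel indicator targets classes_ comes out as the integers 0..n_classes-1:
import numpy
probs_ = numpy.array([0.1, 0.5, 0.4])
k = 2
labels = probs_.argsort()[-k:].tolist()   # indices of the k largest scores -> [2, 1]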
5.4.self.evaluate
def evaluate(self, X, Y):
    top_k_list = [len(l) for l in Y]
    Y_ = self.predict(X, top_k_list)
    Y = self.binarizer.transform(Y)
    averages = ["micro", "macro", "samples", "weighted"]
    results = {}
    for average in averages:
        results[average] = f1_score(Y, Y_, average=average)
    results['acc'] = accuracy_score(Y, Y_)
    print('-------------------')
    print(results)
    return results
6.plot_embeddings
def plot_embeddings(embeddings):
    X, Y = read_node_label('./data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()
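- Reduces the 128-dimensional embeddings to 2-D with t-SNE and scatter-plots the nodes, colored by each node's first label.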